Jump to

pop2piano

mindnlp.transformers.models.pop2piano.modeling_pop2piano

PyTorch Pop2Piano model.

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoAttention

Bases: Module

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
class Pop2PianoAttention(nn.Module):
    """Multi-head attention with optional T5-style relative position bias.

    Acts as self-attention when ``key_value_states`` is None in ``forward``,
    and as cross-attention over provided states otherwise. The q/k/v/o
    projections carry no bias term, and queries are not scaled before the
    softmax (Mesh TensorFlow convention — see the comment in ``__init__``).
    """

    def __init__(self, config: Pop2PianoConfig, has_relative_attention_bias=False):
        """Set up the projection layers and, optionally, the relative-bias table.

        Args:
            config: model configuration providing ``d_model``, ``d_kv``,
                ``num_heads``, ``dropout_rate``, ``is_decoder`` and the
                relative-attention bucket/max-distance settings.
            has_relative_attention_bias: when True, this layer owns the learned
                relative-position bias embedding used by ``compute_bias``.
        """
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            # One learnable bias value per (bucket, head) pair.
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        """Remove the given attention heads, shrinking the projections in place.

        No-op when ``heads`` is empty. Updates ``n_heads``/``inner_dim`` and
        records pruned indices in ``self.pruned_heads`` so ``forward`` can mask
        the matching rows of a shared position bias.
        """
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            # Half of the buckets encode the sign; magnitudes share the rest.
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(mindspore.int64) * num_buckets
            relative_position = ops.abs(relative_position)
        else:
            # Causal case: positive (future) distances collapse to 0.
            relative_position = -ops.minimum(relative_position, ops.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            ops.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(mindspore.int64)
        relative_position_if_large = ops.minimum(
            relative_position_if_large, ops.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += ops.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length):
        """Compute binned relative position bias.

        Returns a tensor of shape (1, num_heads, query_length, key_length),
        broadcastable over the batch dimension of the attention scores.
        """
        context_position = ops.arange(query_length, dtype=mindspore.int64)[:, None]
        memory_position = ops.arange(key_length, dtype=mindspore.int64)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

        Args:
            hidden_states: (batch_size, seq_length, dim) query-side input.
            mask: additive attention mask, folded into ``position_bias``.
            key_value_states: source states for cross-attention; None for self-attention.
            position_bias: precomputed bias to reuse; computed here when None.
            past_key_value: cached (key, value) states, each
                (batch_size, n_heads, past_length, dim_per_head).
            layer_head_mask: multiplicative per-head mask on the attention weights.
            query_length: real query length when ``past_key_value`` is given.
            use_cache: when True and this is a decoder, return key/value states for reuse.
            output_attentions: when True, also return the attention weights.

        Returns:
            (attn_output, present_key_value_state, position_bias) plus
            attn_weights when ``output_attentions`` is True.
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
        batch_size, seq_length = hidden_states.shape[:2]

        real_seq_length = seq_length

        if past_key_value is not None:
            if len(past_key_value) != 2:
                raise ValueError(
                    f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
                )
            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

        def shape(states):
            """projection"""
            return ops.transpose(states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim), 1, 2)

        def unshape(states):
            """reshape"""
            return ops.transpose(states, 1, 2).view(batch_size, -1, self.inner_dim)

        def project(hidden_states, proj_layer, key_value_states, past_key_value):
            """projects hidden states correctly to key/query states"""
            if key_value_states is None:
                # self-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(hidden_states))
            elif past_key_value is None:
                # cross-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(key_value_states))

            if past_key_value is not None:
                if key_value_states is None:
                    # self-attn
                    # (batch_size, n_heads, key_length, dim_per_head)
                    hidden_states = ops.cat([past_key_value, hidden_states], dim=2)
                elif past_key_value.shape[2] != key_value_states.shape[1]:
                    # checking that the `sequence_length` of the `past_key_value` is the same as
                    # the provided `key_value_states` to support prefix tuning
                    # cross-attn
                    # (batch_size, n_heads, seq_length, dim_per_head)
                    hidden_states = shape(proj_layer(key_value_states))
                else:
                    # cross-attn
                    hidden_states = past_key_value
            return hidden_states

        # get query states
        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)

        # get key/value states
        key_states = project(
            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
        )
        value_states = project(
            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
        )

        # compute scores (no 1/sqrt(d) scaling — see init comment)
        scores = ops.matmul(
            query_states, ops.transpose(key_states, 3, 2)
        )  # equivalent of ops.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9

        if position_bias is None:
            if not self.has_relative_attention_bias:
                # This layer has no bias table (e.g. non-first layers); use zeros.
                position_bias = ops.zeros(
                    (1, self.n_heads, real_seq_length, key_length), dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(real_seq_length, key_length)

            # if key and values are already calculated
            # we want only the last query position bias
            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.shape[1] :, :]

            if mask is not None:
                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)

        if self.pruned_heads:
            # NOTE: rebinds `mask` as a per-head keep-mask (the attention mask was
            # already folded into position_bias above); drops rows of pruned heads.
            mask = ops.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked
        # Softmax in float32 for numerical stability, then cast back.
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
            scores
        )  # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )  # (batch_size, n_heads, seq_length, key_length)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = unshape(ops.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
        attn_output = self.o(attn_output)

        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoAttention.compute_bias(query_length, key_length)

Compute binned relative position bias

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
def compute_bias(self, query_length, key_length):
    """Compute the binned relative position bias for the attention scores.

    Returns:
        Tensor of shape (1, num_heads, query_length, key_length), ready to be
        broadcast over the batch dimension.
    """
    query_positions = ops.arange(query_length, dtype=mindspore.int64).unsqueeze(-1)
    key_positions = ops.arange(key_length, dtype=mindspore.int64).unsqueeze(0)
    # Distance from each attending (query) position to each attended-to (key)
    # position; shape (query_length, key_length).
    relative_positions = key_positions - query_positions
    bucket_ids = self._relative_position_bucket(
        relative_positions,
        bidirectional=not self.is_decoder,
        num_buckets=self.relative_attention_num_buckets,
        max_distance=self.relative_attention_max_distance,
    )
    # Embed buckets: (query_length, key_length, num_heads), then move heads to
    # the front and add a leading batch axis.
    bias = self.relative_attention_bias(bucket_ids)
    return bias.permute([2, 0, 1]).unsqueeze(0)

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoAttention.forward(hidden_states, mask=None, key_value_states=None, position_bias=None, past_key_value=None, layer_head_mask=None, query_length=None, use_cache=False, output_attentions=False)

Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
def forward(
    self,
    hidden_states,
    mask=None,
    key_value_states=None,
    position_bias=None,
    past_key_value=None,
    layer_head_mask=None,
    query_length=None,
    use_cache=False,
    output_attentions=False,
):
    """
    Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

    Args:
        hidden_states: (batch_size, seq_length, dim) query-side input.
        mask: additive attention mask, folded into ``position_bias``.
        key_value_states: source states for cross-attention; None for self-attention.
        position_bias: precomputed bias to reuse; computed here when None.
        past_key_value: cached (key, value) states, each
            (batch_size, n_heads, past_length, dim_per_head).
        layer_head_mask: multiplicative per-head mask on the attention weights.
        query_length: real query length when ``past_key_value`` is given.
        use_cache: when True and this is a decoder, return key/value states for reuse.
        output_attentions: when True, also return the attention weights.

    Returns:
        (attn_output, present_key_value_state, position_bias) plus
        attn_weights when ``output_attentions`` is True.
    """
    # Input is (batch_size, seq_length, dim)
    # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
    # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
    batch_size, seq_length = hidden_states.shape[:2]

    real_seq_length = seq_length

    if past_key_value is not None:
        if len(past_key_value) != 2:
            raise ValueError(
                f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
            )
        real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

    key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

    def shape(states):
        """projection"""
        return ops.transpose(states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim), 1, 2)

    def unshape(states):
        """reshape"""
        return ops.transpose(states, 1, 2).view(batch_size, -1, self.inner_dim)

    def project(hidden_states, proj_layer, key_value_states, past_key_value):
        """projects hidden states correctly to key/query states"""
        if key_value_states is None:
            # self-attn
            # (batch_size, n_heads, seq_length, dim_per_head)
            hidden_states = shape(proj_layer(hidden_states))
        elif past_key_value is None:
            # cross-attn
            # (batch_size, n_heads, seq_length, dim_per_head)
            hidden_states = shape(proj_layer(key_value_states))

        if past_key_value is not None:
            if key_value_states is None:
                # self-attn
                # (batch_size, n_heads, key_length, dim_per_head)
                hidden_states = ops.cat([past_key_value, hidden_states], dim=2)
            elif past_key_value.shape[2] != key_value_states.shape[1]:
                # checking that the `sequence_length` of the `past_key_value` is the same as
                # the provided `key_value_states` to support prefix tuning
                # cross-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(key_value_states))
            else:
                # cross-attn
                hidden_states = past_key_value
        return hidden_states

    # get query states
    query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)

    # get key/value states
    key_states = project(
        hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
    )
    value_states = project(
        hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
    )

    # compute scores (no 1/sqrt(d) scaling — folded into initialization)
    scores = ops.matmul(
        query_states, ops.transpose(key_states, 3, 2)
    )  # equivalent of ops.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9

    if position_bias is None:
        if not self.has_relative_attention_bias:
            # This layer has no bias table; use zeros of the right shape.
            position_bias = ops.zeros(
                (1, self.n_heads, real_seq_length, key_length), dtype=scores.dtype
            )
            if self.gradient_checkpointing and self.training:
                position_bias.requires_grad = True
        else:
            position_bias = self.compute_bias(real_seq_length, key_length)

        # if key and values are already calculated
        # we want only the last query position bias
        if past_key_value is not None:
            position_bias = position_bias[:, :, -hidden_states.shape[1] :, :]

        if mask is not None:
            position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)

    if self.pruned_heads:
        # NOTE: rebinds `mask` as a per-head keep-mask (the attention mask was
        # already folded into position_bias above); drops rows of pruned heads.
        mask = ops.ones(position_bias.shape[1])
        mask[list(self.pruned_heads)] = 0
        position_bias_masked = position_bias[:, mask.bool()]
    else:
        position_bias_masked = position_bias

    scores += position_bias_masked
    # Softmax in float32 for numerical stability, then cast back.
    attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
        scores
    )  # (batch_size, n_heads, seq_length, key_length)
    attn_weights = nn.functional.dropout(
        attn_weights, p=self.dropout, training=self.training
    )  # (batch_size, n_heads, seq_length, key_length)

    # Mask heads if we want to
    if layer_head_mask is not None:
        attn_weights = attn_weights * layer_head_mask

    attn_output = unshape(ops.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
    attn_output = self.o(attn_output)

    present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
    outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

    if output_attentions:
        outputs = outputs + (attn_weights,)
    return outputs

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoConcatEmbeddingToMel

Bases: Module

Embedding Matrix for composer tokens.

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
868
869
870
871
872
873
874
875
876
877
878
879
class Pop2PianoConcatEmbeddingToMel(nn.Module):
    """Embedding Matrix for `composer` tokens.

    Prepends the embedding of a composer token to the mel-feature sequence so
    the encoder conditions on the requested composer style.
    """

    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=config.composer_vocab_size, embedding_dim=config.d_model)

    def forward(self, feature, index_value, embedding_offset):
        """Embed the shifted composer id and concatenate it before `feature`."""
        # Shift the raw token id into the embedding table's [0, vocab) range.
        shifted_index = index_value - embedding_offset
        # (batch, d_model) -> (batch, 1, d_model) so it can lead the sequence.
        composer_vector = self.embedding(shifted_index).unsqueeze(1)
        return ops.cat([composer_vector, feature], dim=1)

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoForConditionalGeneration

Bases: Pop2PianoPreTrainedModel

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel):
    """Pop2Piano: a T5-style encoder-decoder with an LM head that turns mel-spectrogram
    features of pop audio into MIDI token sequences.

    The encoder consumes pre-extracted audio features (passed as ``input_features`` /
    ``inputs_embeds``) rather than token ids; a composer-conditioning embedding is
    prepended to the features before generation.
    """

    # These parameters share storage with `self.shared` when `config.tie_word_embeddings`
    # is enabled (see the rescaling in `forward`).
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: Pop2PianoConfig):
        """Build the shared embedding, mel conditioner, encoder/decoder stacks and LM head."""
        super().__init__(config)
        self.config = config
        self.model_dim = config.d_model

        # Token embedding table shared by encoder and decoder.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # Prepends a composer embedding to the mel features (see get_mel_conditioner_outputs).
        self.mel_conditioner = Pop2PianoConcatEmbeddingToMel(config)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False

        self.encoder = Pop2PianoStack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = Pop2PianoStack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the shared token embedding module."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared embedding and propagate it to both stacks."""
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head projection."""
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        """Return the LM head projection."""
        return self.lm_head

    def get_encoder(self):
        """Return the encoder stack (used by the generation utilities)."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder stack."""
        return self.decoder

    def get_mel_conditioner_outputs(
        self,
        input_features: mindspore.Tensor,
        composer: str,
        generation_config: GenerationConfig,
        attention_mask: mindspore.Tensor = None,
    ):
        """
        This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
        control the type of MIDI token generated by the model.

        Args:
            input_features (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                input features extracted from the feature extractor.
            composer (`str`):
                composer token which determines the type of MIDI tokens to be generated.
            generation_config (`~generation.GenerationConfig`):
                The generation config is used to get the composer-feature_token pair.
            attention_mask (`mindspore.Tensor`, *optional*):
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.

        Returns:
            `(input_features, attention_mask)` with one conditioning position prepended to
            both; `attention_mask` is `None` when none was provided.
        """
        composer_to_feature_token = generation_config.composer_to_feature_token
        if composer not in composer_to_feature_token.keys():
            raise ValueError(
                f"Please choose a composer from {list(composer_to_feature_token.keys())}. Composer received - {composer}"
            )
        composer_value = composer_to_feature_token[composer]
        composer_value = mindspore.tensor(composer_value)
        # Broadcast the single composer token id to every example in the batch.
        composer_value = composer_value.tile((input_features.shape[0],))

        # Token ids are shifted by the smallest id so they index the conditioner embedding from 0.
        embedding_offset = min(composer_to_feature_token.values())

        input_features = self.mel_conditioner(
            feature=input_features,
            index_value=composer_value,
            embedding_offset=embedding_offset,
        )
        if attention_mask is not None:
            # Zero the embeddings of examples whose first mask entry is 0
            # (assumed to mean the example is entirely padding — TODO confirm against callers).
            input_features[~attention_mask[:, 0].bool()] = 0.0

            # since self.mel_conditioner adds a new array at the front of inputs_embeds we need to do the same for attention_mask to keep the shapes same
            attention_mask = ops.concatenate([attention_mask[:, 0].view(-1, 1), attention_mask], dim=1)
            return input_features, attention_mask

        return input_features, None

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        decoder_head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        input_features: Optional[mindspore.Tensor] = None,
        decoder_inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], Seq2SeqLMOutput]:
        r"""
        Run the full encoder-decoder pass and optionally compute the LM loss.

        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence-to-sequence language modeling loss. Indices
            should be in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in
            `[0, ..., config.vocab_size - 1]`.
        Returns:
            `Seq2SeqLMOutput` when `return_dict` is true, otherwise a tuple of
            `(loss?, lm_logits, *decoder_outputs, *encoder_outputs)`.
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # `input_features` is just an alias for encoder `inputs_embeds`; both at once is ambiguous.
        if inputs_embeds is not None and input_features is not None:
            raise ValueError("Both `inputs_embeds` and `input_features` received! Please provide only one of them")
        elif input_features is not None and inputs_embeds is None:
            inputs_embeds = input_features

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            # Wrap a plain tuple coming from a previous non-dict call.
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            # Flatten to (batch*seq, vocab) vs (batch*seq,); -100 positions are masked out.
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    @no_grad()
    def generate(
        self,
        input_features,
        attention_mask=None,
        composer="composer1",
        generation_config=None,
        **kwargs,
    ):
        """
        Generates token ids for midi outputs.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
        strategies and code examples, check out the [following guide](./generation_strategies).

        </Tip>

        Parameters:
            input_features (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
            attention_mask:
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
            composer (`str`, *optional*, defaults to `"composer1"`):
                This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
                `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
                `generation_config`. For an example please see
                https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            kwargs:
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
        Return:
            [`~utils.ModelOutput`] or `mindspore.Tensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `mindspore.Tensor`.
                Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:
                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        """

        if generation_config is None:
            generation_config = self.generation_config
        generation_config.update(**kwargs)

        # check for composer_to_feature_token
        if not hasattr(generation_config, "composer_to_feature_token"):
            # NOTE(review): the concatenated message lacks a space before "and" — rendered
            # as "...generation_config.jsonand parse...". Left unchanged here (runtime string).
            raise ValueError(
                "`composer_to_feature_token` was not found! Please refer to "
                "https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json"
                "and parse a dict like that."
            )

        if len(generation_config.composer_to_feature_token) != self.config.composer_vocab_size:
            raise ValueError(
                "config.composer_vocab_size must be same as the number of keys in "
                f"generation_config.composer_to_feature_token! "
                f"Found {self.config.composer_vocab_size} vs {len(generation_config.composer_to_feature_token)}."
            )

        # to control the variation of generated MIDI tokens we concatenate mel-conditioner tokens(which depends on composer_token)
        # at the front of input_features.
        input_features, attention_mask = self.get_mel_conditioner_outputs(
            input_features=input_features,
            attention_mask=attention_mask,
            composer=composer,
            generation_config=generation_config,
        )

        return super().generate(
            inputs=None,
            inputs_embeds=input_features,
            attention_mask=attention_mask,
            generation_config=generation_config,
            **kwargs,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
    **kwargs,
    ):
        """Assemble the kwargs for one decoding step of `forward` during generation."""
        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: mindspore.Tensor):
        """Shift `labels` one position to the right to form decoder inputs."""
        return self._shift_right(labels)

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder each cached key/value state along the batch dimension for beam search."""
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from layer past batch dim
            # NOTE(review): this comment claims the batch dim is at the 2nd position, but the
            # code selects along dim 0 — presumably dim 0 is correct here; confirm cache layout.
            # batch dim of `past` is at 2nd position
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set correct `past` for each of the four key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoForConditionalGeneration.forward(input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, head_mask=None, decoder_head_mask=None, cross_attn_head_mask=None, encoder_outputs=None, past_key_values=None, inputs_embeds=None, input_features=None, decoder_inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the sequence-to-sequence language modeling loss. Indices should be in [-100, 0, ..., config.vocab_size - 1]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size - 1]. Returns:

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    decoder_input_ids: Optional[mindspore.Tensor] = None,
    decoder_attention_mask: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    decoder_head_mask: Optional[mindspore.Tensor] = None,
    cross_attn_head_mask: Optional[mindspore.Tensor] = None,
    encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    input_features: Optional[mindspore.Tensor] = None,
    decoder_inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], Seq2SeqLMOutput]:
    r"""
    Run the full encoder-decoder pass and optionally compute the LM loss.

    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the sequence-to-sequence language modeling loss. Indices
        should be in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100`
        are ignored (masked); the loss is only computed for labels in
        `[0, ..., config.vocab_size - 1]`.
    Returns:
        `Seq2SeqLMOutput` when `return_dict` is true, otherwise a tuple of
        `(loss?, lm_logits, *decoder_outputs, *encoder_outputs)`.
    """
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # `input_features` is just an alias for encoder `inputs_embeds`; both at once is ambiguous.
    if inputs_embeds is not None and input_features is not None:
        raise ValueError("Both `inputs_embeds` and `input_features` received! Please provide only one of them")
    elif input_features is not None and inputs_embeds is None:
        inputs_embeds = input_features

    # Encode if needed (training, first prediction pass)
    if encoder_outputs is None:
        # Convert encoder inputs in embeddings if needed
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
        # Wrap a plain tuple coming from a previous non-dict call.
        encoder_outputs = BaseModelOutput(
            last_hidden_state=encoder_outputs[0],
            hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
            attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
        )

    hidden_states = encoder_outputs[0]

    if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
        # get decoder inputs from shifting lm labels to the right
        decoder_input_ids = self._shift_right(labels)

    # Decode
    decoder_outputs = self.decoder(
        input_ids=decoder_input_ids,
        attention_mask=decoder_attention_mask,
        inputs_embeds=decoder_inputs_embeds,
        past_key_values=past_key_values,
        encoder_hidden_states=hidden_states,
        encoder_attention_mask=attention_mask,
        head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = decoder_outputs[0]

    if self.config.tie_word_embeddings:
        # Rescale output before projecting on vocab
        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
        sequence_output = sequence_output * (self.model_dim**-0.5)

    lm_logits = self.lm_head(sequence_output)

    loss = None
    if labels is not None:
        # Flatten to (batch*seq, vocab) vs (batch*seq,); -100 positions are masked out.
        loss_fct = CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))

    if not return_dict:
        output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
        return ((loss,) + output) if loss is not None else output

    return Seq2SeqLMOutput(
        loss=loss,
        logits=lm_logits,
        past_key_values=decoder_outputs.past_key_values,
        decoder_hidden_states=decoder_outputs.hidden_states,
        decoder_attentions=decoder_outputs.attentions,
        cross_attentions=decoder_outputs.cross_attentions,
        encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        encoder_hidden_states=encoder_outputs.hidden_states,
        encoder_attentions=encoder_outputs.attentions,
    )

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoForConditionalGeneration.generate(input_features, attention_mask=None, composer='composer1', generation_config=None, **kwargs)

Generates token ids for midi outputs.

Most generation-controlling parameters are set in generation_config which, if not passed, will be set to the model's default generation configuration. You can override any generation_config by passing the corresponding parameters to generate(), e.g. .generate(inputs, num_beams=4, do_sample=True). For an overview of generation strategies and code examples, check out the following guide.

PARAMETER DESCRIPTION
input_features

This is the featurized version of audio generated by Pop2PianoFeatureExtractor.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*

attention_mask

For batched generation input_features are padded to have the same shape across all examples. attention_mask helps to determine which areas were padded and which were not. - 1 for tokens that are not padded, - 0 for tokens that are padded.

DEFAULT: None

composer

This value is passed to Pop2PianoConcatEmbeddingToMel to generate different embeddings for each "composer". Please make sure that the composer value is present in composer_to_feature_token in generation_config. For an example please see https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .

TYPE: `str`, *optional*, defaults to `"composer1"` DEFAULT: 'composer1'

generation_config

The generation configuration to be used as base parametrization for the generation call. **kwargs passed to generate matching the attributes of generation_config will override them. If generation_config is not provided, the default will be used, which had the following loading priority: 1) from the generation_config.json model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [~generation.GenerationConfig]'s default values, whose documentation should be checked to parameterize generation.

TYPE: `~generation.GenerationConfig`, *optional* DEFAULT: None

kwargs

Ad hoc parametrization of generate_config and/or additional model-specific kwargs that will be forwarded to the forward function of the model. If the model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with decoder_.

DEFAULT: {}

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
@no_grad()
def generate(
    self,
    input_features,
    attention_mask=None,
    composer="composer1",
    generation_config=None,
    **kwargs,
):
    """
    Generates token ids for midi outputs.

    <Tip warning={true}>

    Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
    model's default generation configuration. You can override any `generation_config` by passing the corresponding
    parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
    strategies and code examples, check out the [following guide](./generation_strategies).

    </Tip>

    Parameters:
        input_features (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
        attention_mask:
            For batched generation `input_features` are padded to have the same shape across all examples.
            `attention_mask` helps to determine which areas were padded and which were not.
            - 1 for tokens that are **not padded**,
            - 0 for tokens that are **padded**.
        composer (`str`, *optional*, defaults to `"composer1"`):
            This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
            `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
            `generation_config`. For an example please see
            https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
        generation_config (`~generation.GenerationConfig`, *optional*):
            The generation configuration to be used as base parametrization for the generation call. `**kwargs`
            passed to generate matching the attributes of `generation_config` will override them. If
            `generation_config` is not provided, the default will be used, which had the following loading
            priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
            configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
            default values, whose documentation should be checked to parameterize generation.
        kwargs:
            Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
            forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
            specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
    Return:
        [`~utils.ModelOutput`] or `mindspore.Tensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
        or when `config.return_dict_in_generate=True`) or a `mindspore.Tensor`.
            Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
            [`~utils.ModelOutput`] types are:
                - [`~generation.GenerateEncoderDecoderOutput`],
                - [`~generation.GenerateBeamEncoderDecoderOutput`]
    """

    if generation_config is None:
        generation_config = self.generation_config
    generation_config.update(**kwargs)

    # check for composer_to_feature_token
    if not hasattr(generation_config, "composer_to_feature_token"):
        # NOTE(review): the concatenated message lacks a space before "and" — rendered
        # as "...generation_config.jsonand parse...". Left unchanged here (runtime string).
        raise ValueError(
            "`composer_to_feature_token` was not found! Please refer to "
            "https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json"
            "and parse a dict like that."
        )

    if len(generation_config.composer_to_feature_token) != self.config.composer_vocab_size:
        raise ValueError(
            "config.composer_vocab_size must be same as the number of keys in "
            f"generation_config.composer_to_feature_token! "
            f"Found {self.config.composer_vocab_size} vs {len(generation_config.composer_to_feature_token)}."
        )

    # to control the variation of generated MIDI tokens we concatenate mel-conditioner tokens(which depends on composer_token)
    # at the front of input_features.
    input_features, attention_mask = self.get_mel_conditioner_outputs(
        input_features=input_features,
        attention_mask=attention_mask,
        composer=composer,
        generation_config=generation_config,
    )

    return super().generate(
        inputs=None,
        inputs_embeds=input_features,
        attention_mask=attention_mask,
        generation_config=generation_config,
        **kwargs,
    )

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoForConditionalGeneration.get_mel_conditioner_outputs(input_features, composer, generation_config, attention_mask=None)

This method is used to concatenate mel conditioner tokens at the front of the input_features in order to control the type of MIDI token generated by the model.

PARAMETER DESCRIPTION
input_features

input features extracted from the feature extractor.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`

composer

composer token which determines the type of MIDI tokens to be generated.

TYPE: `str`

generation_config

The generation is used to get the composer-feature_token pair.

TYPE: `~generation.GenerationConfig`

attention_mask

For batched generation input_features are padded to have the same shape across all examples. attention_mask helps to determine which areas were padded and which were not. - 1 for tokens that are not padded, - 0 for tokens that are padded.

TYPE: ``, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
def get_mel_conditioner_outputs(
    self,
    input_features: mindspore.Tensor,
    composer: str,
    generation_config: GenerationConfig,
    attention_mask: mindspore.Tensor = None,
):
    """
    This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
    control the type of MIDI token generated by the model.

    Args:
        input_features (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            input features extracted from the feature extractor.
        composer (`str`):
            composer token which determines the type of MIDI tokens to be generated.
        generation_config (`~generation.GenerationConfig`):
            The generation config is used to get the composer-to-feature_token mapping.
        attention_mask (`mindspore.Tensor`, *optional*):
            For batched generation `input_features` are padded to have the same shape across all examples.
            `attention_mask` helps to determine which areas were padded and which were not.
            - 1 for tokens that are **not padded**,
            - 0 for tokens that are **padded**.

    Returns:
        A tuple `(input_features, attention_mask)`: the conditioned features (with one extra leading position
        added by `self.mel_conditioner`) and the correspondingly extended mask, or `None` when no mask was given.

    Raises:
        ValueError: if `composer` is not a key of `generation_config.composer_to_feature_token`.
    """
    composer_to_feature_token = generation_config.composer_to_feature_token
    if composer not in composer_to_feature_token.keys():
        raise ValueError(
            f"Please choose a composer from {list(composer_to_feature_token.keys())}. Composer received - {composer}"
        )
    composer_value = composer_to_feature_token[composer]
    composer_value = mindspore.tensor(composer_value)
    # one conditioner token id per example in the batch
    composer_value = composer_value.tile((input_features.shape[0],))

    # the conditioner embedding table is indexed relative to the smallest feature token id
    embedding_offset = min(composer_to_feature_token.values())

    # prepends the embedded conditioner token to the features (adds one position along the sequence axis)
    input_features = self.mel_conditioner(
        feature=input_features,
        index_value=composer_value,
        embedding_offset=embedding_offset,
    )
    if attention_mask is not None:
        # zero out features of examples whose first mask position is padded
        input_features[~attention_mask[:, 0].bool()] = 0.0

        # since self.mel_conditioner adds a new array at the front of inputs_embeds we need to do the same for attention_mask to keep the shapes same
        attention_mask = ops.concatenate([attention_mask[:, 0].view(-1, 1), attention_mask], dim=1)
        return input_features, attention_mask

    return input_features, None

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoLayerNorm

Bases: Module

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
class Pop2PianoLayerNorm(nn.Module):
    """Scale-only layer norm used by Pop2Piano: no bias and no mean subtraction (RMSNorm)."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
        """
        super().__init__()
        # learned per-channel scale, initialized to ones; there is deliberately no bias term
        self.weight = nn.Parameter(ops.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467): the variance is
        # computed without subtracting the mean and no bias is applied. The accumulation for
        # half-precision inputs is intentionally performed in fp32.
        mean_square = ops.mean(hidden_states.to(mindspore.float32).pow(2), -1, keepdim=True)
        normalized = hidden_states * ops.rsqrt(mean_square + self.variance_epsilon)

        # cast back to half precision when the weights are stored in fp16/bf16
        if self.weight.dtype in [mindspore.float16, mindspore.bfloat16]:
            normalized = normalized.to(self.weight.dtype)

        return self.weight * normalized

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoLayerNorm.__init__(hidden_size, eps=1e-06)

Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
50
51
52
53
54
55
56
def __init__(self, hidden_size, eps=1e-6):
    """
    Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.

    Args:
        hidden_size: size of the normalized (last) dimension; also the length of the learned scale vector.
        eps (float): small constant added to the variance for numerical stability.
    """
    super().__init__()
    # scale-only parameter (no bias), initialized to ones
    self.weight = nn.Parameter(ops.ones(hidden_size))
    self.variance_epsilon = eps

mindnlp.transformers.models.pop2piano.modeling_pop2piano.Pop2PianoPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\pop2piano\modeling_pop2piano.py
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
class Pop2PianoPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Pop2PianoConfig
    base_model_prefix = "transformer"
    is_parallelizable = False
    supports_gradient_checkpointing = True
    _no_split_modules = ["Pop2PianoBlock"]
    # `wo` projections are kept in fp32 for numerical stability
    _keep_in_fp32_modules = ["wo"]

    def _init_weights(self, module):
        """Initialize the weights of `module` following the Mesh TensorFlow scheme used by T5/Pop2Piano."""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, Pop2PianoLayerNorm):
            nn.init.constant_(module.weight, factor * 1.0)
        elif isinstance(module, Pop2PianoConcatEmbeddingToMel):
            nn.init.normal_(module.embedding.weight, mean=0.0, std=factor * 1.0)
        elif isinstance(module, Pop2PianoForConditionalGeneration):
            # Mesh TensorFlow embeddings initialization
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
            nn.init.normal_(module.shared.weight, mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                nn.init.normal_(module.lm_head.weight, mean=0.0, std=factor * 1.0)
        elif isinstance(module, Pop2PianoDenseActDense):
            # Mesh TensorFlow FF initialization
            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
            nn.init.normal_(module.wi.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                nn.init.zeros_(module.wi.bias)
            nn.init.normal_(module.wo.weight, mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                nn.init.zeros_(module.wo.bias)
        elif isinstance(module, Pop2PianoDenseGatedActDense):
            nn.init.normal_(module.wi_0.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                nn.init.zeros_(module.wi_0.bias)
            nn.init.normal_(module.wi_1.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                nn.init.zeros_(module.wi_1.bias)
            nn.init.normal_(module.wo.weight, mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                # BUGFIX: was `nn.init.zero_`, which does not exist (AttributeError when a `wo`
                # bias is present); use `zeros_` like every other bias branch above.
                nn.init.zeros_(module.wo.bias)
        elif isinstance(module, Pop2PianoAttention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            nn.init.normal_(module.q.weight, mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            nn.init.normal_(module.k.weight, mean=0.0, std=factor * (d_model**-0.5))
            nn.init.normal_(module.v.weight, mean=0.0, std=factor * (d_model**-0.5))
            nn.init.normal_(module.o.weight, mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                nn.init.normal_(module.relative_attention_bias.weight, mean=0.0, std=factor * ((d_model) ** -0.5))

    def _shift_right(self, input_ids):
        """Shift `input_ids` one position to the right, prepending `decoder_start_token_id`
        and replacing any -100 label padding with `pad_token_id` (standard T5-style decoder input prep)."""
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id."
            )

        # prepend the start token and drop the last position so the length is unchanged
        shifted_input_ids = ops.full(input_ids.shape[:-1] + (1,), decoder_start_token_id, dtype=input_ids.dtype)
        shifted_input_ids = ops.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids = shifted_input_ids.masked_fill(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids

mindnlp.transformers.models.pop2piano.tokenization_pop2piano

Tokenization class for Pop2Piano.

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer

Bases: PreTrainedTokenizer

Constructs a Pop2Piano tokenizer. This tokenizer does not require training.

This tokenizer inherits from [PreTrainedTokenizer] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

PARAMETER DESCRIPTION
vocab

Path to the vocab file which contains the vocabulary.

TYPE: `str`

default_velocity

Determines the default velocity to be used while creating midi Notes.

TYPE: `int`, *optional*, defaults to 77 DEFAULT: 77

num_bars

Determines the cutoff_time_idx for each token.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
class Pop2PianoTokenizer(PreTrainedTokenizer):
    """
    Constructs a Pop2Piano tokenizer. This tokenizer does not require training.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab (`str`):
            Path to the vocab file which contains the vocabulary.
        default_velocity (`int`, *optional*, defaults to 77):
            Determines the default velocity to be used while creating midi Notes.
        num_bars (`int`, *optional*, defaults to 2):
            Determines cutoff_time_idx in for each token.
    """
    # Model inputs are midi token ids, hence `token_ids` rather than the usual `input_ids`.
    model_input_names = ["token_ids", "attention_mask"]
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

    def __init__(
        self,
        vocab,
        default_velocity=77,
        num_bars=2,
        unk_token="-1",
        eos_token="1",
        pad_token="0",
        bos_token="2",
        **kwargs,
    ):
        """
        Initialize a Pop2PianoTokenizer from a JSON vocabulary file.

        Args:
            vocab (str): path to the JSON vocabulary file.
            default_velocity (int): default velocity used when creating midi Notes. Defaults to 77.
            num_bars (int): determines cutoff_time_idx for each token. Defaults to 2.
            unk_token (str or AddedToken): unknown token; strings are wrapped in an `AddedToken`.
            eos_token (str or AddedToken): end-of-sequence token; strings are wrapped in an `AddedToken`.
            pad_token (str or AddedToken): padding token; strings are wrapped in an `AddedToken`.
            bos_token (str or AddedToken): beginning-of-sequence token; strings are wrapped in an `AddedToken`.

        Raises:
            FileNotFoundError: if the `vocab` file does not exist.
            JSONDecodeError: if the `vocab` file is not valid JSON.
        """

        def _as_added_token(token):
            # plain strings become AddedToken instances with no stripping on either side
            return AddedToken(token, lstrip=False, rstrip=False) if isinstance(token, str) else token

        unk_token = _as_added_token(unk_token)
        eos_token = _as_added_token(eos_token)
        pad_token = _as_added_token(pad_token)
        bos_token = _as_added_token(bos_token)

        self.default_velocity = default_velocity
        self.num_bars = num_bars

        # load the token -> id vocabulary from disk
        with open(vocab, "rb") as vocab_file:
            self.encoder = json.load(vocab_file)

        # inverse mapping: id -> token string
        self.decoder = {token_id: token for token, token_id in self.encoder.items()}

        super().__init__(
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            bos_token=bos_token,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size of the tokenizer (base vocab only, excluding added tokens)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the full vocabulary: the base vocab merged with any added tokens."""
        full_vocab = dict(self.encoder)
        full_vocab.update(self.added_tokens_encoder)
        return full_vocab

    def _convert_id_to_token(self, token_id: int) -> list:
        """
        Decodes a token id generated by the transformer into a `[token_type, value]` pair.

        Args:
            token_id (`int`):
                This denotes the ids generated by the transformers to be converted to Midi tokens.

        Returns:
            `List`: A list consists of token_type (`str`) and value (`int`).
        """
        # unknown ids fall back to an unk time token, e.g. "-1_TOKEN_TIME"
        raw_token = self.decoder.get(token_id, f"{self.unk_token}_TOKEN_TIME")
        # vocab entries look like "<value>_<TOKEN_TYPE>"; the type itself may contain underscores
        value_part, _, type_part = raw_token.partition("_")
        return [type_part, int(value_part)]

    def _convert_token_to_id(self, token, token_type="TOKEN_TIME") -> int:
        """
        Encodes a Midi token into its transformer token id.

        Args:
            token (`int`):
                This denotes the token value.
            token_type (`str`):
                This denotes the type of the token. There are four types of midi tokens such as "TOKEN_TIME",
                "TOKEN_VELOCITY", "TOKEN_NOTE" and "TOKEN_SPECIAL".

        Returns:
            `int`: returns the id of the token, or the unk id when the pair is not in the vocabulary.
        """
        lookup_key = f"{token}_{token_type}"
        unknown_id = int(self.unk_token)
        return self.encoder.get(lookup_key, unknown_id)

    def relative_batch_tokens_ids_to_notes(
        self,
        tokens: np.ndarray,
        beat_offset_idx: int,
        bars_per_batch: int,
        cutoff_time_idx: int,
    ):
        """
        Converts relative tokens to notes which are then used to generate pretty midi object.

        Args:
            tokens (`numpy.ndarray`):
                Tokens to be converted to notes.
            beat_offset_idx (`int`):
                Denotes beat offset index for each note in generated Midi.
            bars_per_batch (`int`):
                A parameter to control the Midi output generation.
            cutoff_time_idx (`int`):
                Denotes the cutoff time index for each note in generated Midi.
        """
        collected = []

        for batch_idx, batch_tokens in enumerate(tokens):
            # each batch covers `bars_per_batch` bars of 4 beats each
            batch_start_idx = beat_offset_idx + batch_idx * bars_per_batch * 4
            batch_notes = self.relative_tokens_ids_to_notes(
                batch_tokens,
                start_idx=batch_start_idx,
                cutoff_time_idx=cutoff_time_idx + batch_start_idx,
            )
            if len(batch_notes) > 0:
                collected.append(batch_notes)

        if not collected:
            return []
        if len(collected) == 1:
            return collected[0]
        return np.concatenate(collected, axis=0)

    def relative_batch_tokens_ids_to_midi(
        self,
        tokens: np.ndarray,
        beatstep: np.ndarray,
        beat_offset_idx: int = 0,
        bars_per_batch: int = 2,
        cutoff_time_idx: int = 12,
    ):
        """
        Converts tokens to Midi. This method calls `relative_batch_tokens_ids_to_notes` method to convert batch tokens
        to notes then uses `notes_to_midi` method to convert them to Midi.

        Args:
            tokens (`numpy.ndarray`):
                Denotes tokens which alongside beatstep will be converted to Midi.
            beatstep (`np.ndarray`):
                We get beatstep from feature extractor which is also used to get Midi.
            beat_offset_idx (`int`, *optional*, defaults to 0):
                Denotes beat offset index for each note in generated Midi.
            bars_per_batch (`int`, *optional*, defaults to 2):
                A parameter to control the Midi output generation.
            cutoff_time_idx (`int`, *optional*, defaults to 12):
                Denotes the cutoff time index for each note in generated Midi.
        """
        # a `None` offset behaves like 0
        if beat_offset_idx is None:
            beat_offset_idx = 0
        notes = self.relative_batch_tokens_ids_to_notes(
            tokens=tokens,
            beat_offset_idx=beat_offset_idx,
            bars_per_batch=bars_per_batch,
            cutoff_time_idx=cutoff_time_idx,
        )
        # anchor all note times to the beat the generation started from
        return self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])

    # Taken from the original code
    # Please see https://github.com/sweetcocoa/pop2piano/blob/fac11e8dcfc73487513f4588e8d0c22a22f2fdc5/midi_tokenizer.py#L257
    def relative_tokens_ids_to_notes(self, tokens: np.ndarray, start_idx: float, cutoff_time_idx: float = None):
        """
        Converts relative tokens to notes which will then be used to create Pretty Midi objects.

        Args:
            tokens (`numpy.ndarray`):
                Relative Tokens which will be converted to notes.
            start_idx (`float`):
                A parameter which denotes the starting index.
            cutoff_time_idx (`float`, *optional*):
                A parameter used while converting tokens to notes.
        """
        words = [self._convert_id_to_token(token) for token in tokens]

        current_idx = start_idx
        current_velocity = 0
        # one onset slot per pitch; sized from the number of *_TOKEN_NOTE vocab entries (+1)
        note_onsets_ready = [None for i in range(sum(k.endswith("NOTE") for k in self.encoder.keys()) + 1)]
        notes = []
        for token_type, number in words:
            if token_type == "TOKEN_SPECIAL":
                # special value 1 marks end of sequence: stop decoding
                if number == 1:
                    break
            elif token_type == "TOKEN_TIME":
                # advance the current time index (clamped by cutoff_time_idx inside the helper)
                current_idx = token_time_to_note(
                    number=number, cutoff_time_idx=cutoff_time_idx, current_idx=current_idx
                )
            elif token_type == "TOKEN_VELOCITY":
                # velocity applies to all following TOKEN_NOTE events until changed
                current_velocity = number

            elif token_type == "TOKEN_NOTE":
                # either records a note onset or closes an open onset into a full note
                notes = token_note_to_note(
                    number=number,
                    current_velocity=current_velocity,
                    default_velocity=self.default_velocity,
                    note_onsets_ready=note_onsets_ready,
                    current_idx=current_idx,
                    notes=notes,
                )
            else:
                raise ValueError("Token type not understood!")

        for pitch, note_onset in enumerate(note_onsets_ready):
            # force offset if no offset for each pitch
            if note_onset is not None:
                if cutoff_time_idx is None:
                    cutoff = note_onset + 1
                else:
                    cutoff = max(cutoff_time_idx, note_onset + 1)

                offset_idx = max(current_idx, cutoff)
                notes.append([note_onset, offset_idx, pitch, self.default_velocity])

        if len(notes) == 0:
            return []

        # sort notes primarily by onset index, secondarily by offset index
        notes = np.array(notes)
        note_order = notes[:, 0] * 128 + notes[:, 1]
        notes = notes[note_order.argsort()]
        return notes

    def notes_to_midi(self, notes: np.ndarray, beatstep: np.ndarray, offset_sec: float = 0.0):
        """
        Converts notes to Midi.

        Args:
            notes (`numpy.ndarray`):
                This is used to create Pretty Midi objects. Each row is
                `(onset_idx, offset_idx, pitch, velocity)`.
            beatstep (`numpy.ndarray`):
                This is the extrapolated beatstep that we get from feature extractor; it maps
                beat indices to times in seconds.
            offset_sec (`float`, *optional*, defaults to 0.0):
                This represents the offset seconds which is used while creating each Pretty Midi Note.
        """
        requires_backends(self, ["pretty_midi"])

        new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
        new_inst = pretty_midi.Instrument(program=0)
        new_notes = []

        # translate each note's beat indices to absolute seconds, shifted by offset_sec
        for onset_idx, offset_idx, pitch, velocity in notes:
            new_note = pretty_midi.Note(
                velocity=velocity,
                pitch=pitch,
                start=beatstep[onset_idx] - offset_sec,
                end=beatstep[offset_idx] - offset_sec,
            )
            new_notes.append(new_note)
        new_inst.notes = new_notes
        new_pm.instruments.append(new_inst)
        # drop any malformed notes before returning
        new_pm.remove_invalid_notes()
        return new_pm

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Saves the tokenizer's vocabulary dictionary to the provided save_directory.

        Args:
            save_directory (`str`):
                A path to the directory where to saved. It will be created if it doesn't exist.
            filename_prefix (`Optional[str]`, *optional*):
                A prefix to add to the names of the files saved by the tokenizer.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return None

        # Save the encoder under "<prefix->vocab-file-name".
        prefix = filename_prefix + "-" if filename_prefix else ""
        out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab"])
        with open(out_vocab_file, "w") as vocab_file:
            json.dump(self.encoder, vocab_file)

        return (out_vocab_file,)

    def encode_plus(
        self,
        notes: Union[np.ndarray, List["pretty_midi.Note"]],
        truncation_strategy: Optional[TruncationStrategy] = None,
        max_length: Optional[int] = None,
        **kwargs,
    ) -> BatchEncoding:
        r"""
        This is the `encode_plus` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer
        generated token ids. It only works on a single batch, to process multiple batches please use
        `batch_encode_plus` or `__call__` method.

        Args:
            notes (`numpy.ndarray` of shape `[sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
                This represents the midi notes. If `notes` is a `numpy.ndarray`:

                - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.

                If `notes` is a `list` containing `pretty_midi.Note` objects:

                - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*):
                Indicates the truncation strategy that is going to be used during truncation.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).

        Returns:
            `BatchEncoding` containing the tokens ids.
        """
        requires_backends(self, ["pretty_midi"])

        # check if notes is a pretty_midi object or not, if yes then extract the attributes and put them into a numpy
        # array.
        if isinstance(notes[0], pretty_midi.Note):
            notes = np.array(
                [[each_note.start, each_note.end, each_note.pitch, each_note.velocity] for each_note in notes]
            ).reshape(-1, 4)

        # to round up all the values to the closest int values.
        notes = np.round(notes).astype(np.int32)
        max_time_idx = notes[:, :2].max()

        # bucket note events by time index: (pitch, velocity) at onset, (pitch, 0) at offset
        times = [[] for i in range((max_time_idx + 1))]
        for onset, offset, pitch, velocity in notes:
            times[onset].append([pitch, velocity])
            times[offset].append([pitch, 0])

        # emit a TOKEN_TIME for every non-empty time step, switching TOKEN_VELOCITY only when
        # the binarized velocity (note-on vs note-off) changes, followed by the TOKEN_NOTE pitches
        tokens = []
        current_velocity = 0
        for i, time in enumerate(times):
            if len(time) == 0:
                continue
            tokens.append(self._convert_token_to_id(i, "TOKEN_TIME"))
            for pitch, velocity in time:
                # binarize: 1 for note-on, 0 for note-off
                velocity = int(velocity > 0)
                if current_velocity != velocity:
                    current_velocity = velocity
                    tokens.append(self._convert_token_to_id(velocity, "TOKEN_VELOCITY"))
                tokens.append(self._convert_token_to_id(pitch, "TOKEN_NOTE"))

        total_len = len(tokens)

        # truncation
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            tokens, _, _ = self.truncate_sequences(
                ids=tokens,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation_strategy,
                **kwargs,
            )

        return BatchEncoding({"token_ids": tokens})

    def batch_encode_plus(
        self,
        notes: Union[np.ndarray, List["pretty_midi.Note"]],
        truncation_strategy: Optional[TruncationStrategy] = None,
        max_length: Optional[int] = None,
        **kwargs,
    ) -> BatchEncoding:
        r"""
        This is the `batch_encode_plus` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer
        generated token ids. It works on multiple batches by calling `encode_plus` multiple times in a loop.

        Args:
            notes (`numpy.ndarray` of shape `[batch_size, sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
                This represents the midi notes. If `notes` is a `numpy.ndarray`:

                - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.

                If `notes` is a `list` containing `pretty_midi.Note` objects:

                - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*):
                Indicates the truncation strategy that is going to be used during truncation.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).

        Returns:
            `BatchEncoding` containing the tokens ids.
        """
        # encode every sequence independently and collect just the token ids
        encoded_batch_token_ids = [
            self.encode_plus(
                sequence_notes,
                truncation_strategy=truncation_strategy,
                max_length=max_length,
                **kwargs,
            )["token_ids"]
            for sequence_notes in notes
        ]

        return BatchEncoding({"token_ids": encoded_batch_token_ids})

    def __call__(
        self,
        notes: Union[
            np.ndarray,
            List["pretty_midi.Note"],
            List[List["pretty_midi.Note"]],
        ],
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        r"""
        Convert midi notes to transformer token ids (the `__call__` entry point of `Pop2PianoTokenizer`).

        Args:
            notes (`numpy.ndarray` of shape `[batch_size, max_sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
                The midi notes.

                If `notes` is a `numpy.ndarray`, each sequence must have 4 values: `onset idx`,
                `offset idx`, `pitch` and `velocity`.

                If `notes` is a `list` containing `pretty_midi.Note` objects, each note must have 4
                attributes: `start`, `end`, `pitch` and `velocity`.
            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: pad to the longest sequence in the batch (or no padding for a
                  single sequence).
                - `'max_length'`: pad to `max_length`, or to the maximum acceptable input length for the
                  model if `max_length` is not provided.
                - `False` or `'do_not_pad'` (default): no padding (the output batch may contain sequences
                  of different lengths).
            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `None`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'longest_first'`: truncate to `max_length` (or the model maximum), removing
                  token by token from the longest sequence in the pair if a pair is provided.
                - `'only_first'`: truncate only the first sequence of a pair.
                - `'only_second'`: truncate only the second sequence of a pair.
                - `False` or `'do_not_truncate'`: no truncation (the output batch may contain sequences
                  longer than the model maximum admissible input size).
            max_length (`int`, *optional*):
                Controls the maximum length used by the truncation/padding parameters. If left unset, the
                predefined model maximum length is used where one is required; if the model has no
                specific maximum input length, truncation/padding to a maximum length is deactivated.
            pad_to_multiple_of (`int`, *optional*):
                If set, pad the sequence to a multiple of the provided value. Especially useful to enable
                Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, follows the tokenizer's
                default defined by its `return_outputs` attribute.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                If set, return tensors instead of lists of python integers: `'tf'` (TensorFlow
                `tf.constant`), `'pt'` (PyTorch `torch.Tensor`) or `'np'` (Numpy `np.ndarray`).
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.

        Returns:
            `BatchEncoding` containing the token_ids.
        """
        # Input is batched either as an ndarray of rank 3 (`[batch_size, sequence_length, 4]`)
        # or as a list whose elements are themselves lists of `pretty_midi.Note` objects.
        if isinstance(notes, np.ndarray):
            is_batched = notes.ndim == 3
        else:
            is_batched = isinstance(notes[0], list)

        # Resolve the padding and truncation strategies once, up front.
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        if is_batched:
            # Unless the caller explicitly said otherwise, batched inputs get an attention mask.
            if return_attention_mask is None:
                return_attention_mask = True
            encode = self.batch_encode_plus
        else:
            encode = self.encode_plus

        token_ids = encode(
            notes=notes,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            **kwargs,
        )

        # Truncation already happened during encoding; only padding is left to apply.
        return self.pad(
            token_ids,
            padding=padding_strategy,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
            verbose=verbose,
        )

    def batch_decode(
        self,
        token_ids,
        feature_extractor_output: BatchFeature,
        return_midi: bool = True,
    ):
        r"""
        This is the `batch_decode` method for `Pop2PianoTokenizer`. It converts the token_ids generated by the
        transformer to midi_notes and returns them.

        Args:
            token_ids (`Union[np.ndarray, torch.Tensor, tf.Tensor]`):
                Output token_ids of `Pop2PianoConditionalGeneration` model.
            feature_extractor_output (`BatchFeature`):
                Denotes the output of `Pop2PianoFeatureExtractor.__call__`. It must contain `"beatstep"` and
                `"extrapolated_beatstep"`. Also `"attention_mask_beatsteps"` and
                `"attention_mask_extrapolated_beatstep"`
                 should be present if they were returned by the feature extractor.
            return_midi (`bool`, *optional*, defaults to `True`):
                Whether to return midi object or not.

        Returns:
            Conditional Return:
                If `return_midi` is True:

                - `BatchEncoding` containing both `notes` and `pretty_midi.pretty_midi.PrettyMIDI` objects.

                If `return_midi` is False:

                - `BatchEncoding` containing `notes`.
        """
        # Check whether all three attention masks (attention_mask, attention_mask_beatsteps,
        # attention_mask_extrapolated_beatstep) were returned by the feature extractor.
        attention_masks_present = bool(
            hasattr(feature_extractor_output, "attention_mask")
            and hasattr(feature_extractor_output, "attention_mask_beatsteps")
            and hasattr(feature_extractor_output, "attention_mask_extrapolated_beatstep")
        )

        # Batched inputs (more than one beatsteps row) cannot be decoded without the attention
        # masks, since the masks are what delimit the individual examples below.
        if not attention_masks_present and feature_extractor_output["beatsteps"].shape[0] > 1:
            raise ValueError(
                "attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep must be present "
                "for batched inputs! But one of them were not present."
            )

        # Check for length mismatch between token_ids, beatsteps and extrapolated_beatstep.
        if attention_masks_present:
            # The number of examples equals the number of zero rows in attention_mask
            # (examples are separated by zero arrays); it must agree with the number of
            # beatsteps and extrapolated_beatstep rows.
            if (
                sum(feature_extractor_output["attention_mask"][:, 0] == 0)
                != feature_extractor_output["beatsteps"].shape[0]
                or feature_extractor_output["beatsteps"].shape[0]
                != feature_extractor_output["extrapolated_beatstep"].shape[0]
            ):
                raise ValueError(
                    "Length mistamtch between token_ids, beatsteps and extrapolated_beatstep! Found "
                    f"token_ids length - {token_ids.shape[0]}, beatsteps shape - {feature_extractor_output['beatsteps'].shape[0]} "
                    f"and extrapolated_beatsteps shape - {feature_extractor_output['extrapolated_beatstep'].shape[0]}"
                )
            if feature_extractor_output["attention_mask"].shape[0] != token_ids.shape[0]:
                raise ValueError(
                    f"Found attention_mask of length - {feature_extractor_output['attention_mask'].shape[0]} but token_ids of length - {token_ids.shape[0]}"
                )
        else:
            # If there is no attention mask present then it's surely a single example.
            if (
                feature_extractor_output["beatsteps"].shape[0] != 1
                or feature_extractor_output["extrapolated_beatstep"].shape[0] != 1
            ):
                raise ValueError(
                    "Length mistamtch of beatsteps and extrapolated_beatstep! Since attention_mask is not present the number of examples must be 1, "
                    f"But found beatsteps length - {feature_extractor_output['beatsteps'].shape[0]}, extrapolated_beatsteps length - {feature_extractor_output['extrapolated_beatstep'].shape[0]}."
                )

        if attention_masks_present:
            # Examples are separated by zero rows: each separator index marks the (exclusive)
            # end of one example in token_ids.
            batch_idx = np.where(feature_extractor_output["attention_mask"][:, 0] == 0)[0]
        else:
            # Single example: it spans the whole token_ids array.
            batch_idx = [token_ids.shape[0]]

        notes_list = []
        pretty_midi_objects_list = []
        start_idx = 0
        for index, end_idx in enumerate(batch_idx):
            each_tokens_ids = token_ids[start_idx:end_idx]
            # Trim each sequence at the last occurrence of eos_token_id (upper bound, inclusive).
            each_tokens_ids = each_tokens_ids[:, : int(np.max(np.where(each_tokens_ids == int(self.eos_token))[1])) + 1]
            beatsteps = feature_extractor_output["beatsteps"][index]
            extrapolated_beatstep = feature_extractor_output["extrapolated_beatstep"][index]

            # If attention masks are present, strip the padded tail from beatsteps and
            # extrapolated_beatstep: keep everything up to the last position where the mask is 1.
            if attention_masks_present:
                attention_mask_beatsteps = feature_extractor_output["attention_mask_beatsteps"][index]
                attention_mask_extrapolated_beatstep = feature_extractor_output[
                    "attention_mask_extrapolated_beatstep"
                ][index]
                beatsteps = beatsteps[: int(np.max(np.where(attention_mask_beatsteps == 1)[0])) + 1]
                extrapolated_beatstep = extrapolated_beatstep[
                    : int(np.max(np.where(attention_mask_extrapolated_beatstep == 1)[0])) + 1
                ]

            # Normalize everything to numpy before handing off to the midi converter.
            each_tokens_ids = to_numpy(each_tokens_ids)
            beatsteps = to_numpy(beatsteps)
            extrapolated_beatstep = to_numpy(extrapolated_beatstep)

            pretty_midi_object = self.relative_batch_tokens_ids_to_midi(
                tokens=each_tokens_ids,
                beatstep=extrapolated_beatstep,
                bars_per_batch=self.num_bars,
                cutoff_time_idx=(self.num_bars + 1) * 4,
            )

            # Shift note times from relative to absolute by the example's first beatstep.
            for note in pretty_midi_object.instruments[0].notes:
                note.start += beatsteps[0]
                note.end += beatsteps[0]
                notes_list.append(note)

            pretty_midi_objects_list.append(pretty_midi_object)
            # NOTE(review): `batch_idx` holds absolute separator indices, so `+=` overshoots the
            # true start for a third and later example (`start_idx = end_idx + 1` would not);
            # this matches upstream, but verify for batches of more than two examples.
            start_idx += end_idx + 1  # 1 represents the zero array

        if return_midi:
            return BatchEncoding({"notes": notes_list, "pretty_midi_objects": pretty_midi_objects_list})

        return BatchEncoding({"notes": notes_list})

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.vocab_size property

Returns the vocabulary size of the tokenizer.

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.__call__(notes, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, **kwargs)

This is the __call__ method for Pop2PianoTokenizer. It converts the midi notes to the transformer generated token ids.

PARAMETER DESCRIPTION
notes

This represents the midi notes.

If notes is a numpy.ndarray:

Each sequence must have 4 values, they are onset idx, offset idx, pitch and velocity.

If notes is a list containing pretty_midi.Note objects:

  • Each sequence must have 4 attributes, they are start, end, pitch and velocity.

TYPE: `numpy.ndarray` of shape `[batch_size, max_sequence_length, 4]` or `list` of `pretty_midi.Note` objects

padding

Activates and controls padding. Accepts the following values:

  • True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single sequence if provided).
  • 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided.
  • False or 'do_not_pad' (default): No padding (i.e., can output a batch with sequences of different lengths).

TYPE: `bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False` DEFAULT: False

truncation

Activates and controls truncation. Accepts the following values:

  • True or 'longest_first': Truncate to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided.
  • 'only_first': Truncate to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
  • 'only_second': Truncate to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
  • False or 'do_not_truncate' (default): No truncation (i.e., can output batch with sequence lengths greater than the model maximum admissible input size).

TYPE: `bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `None` DEFAULT: None

max_length

Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to None, this will use the predefined model maximum length if a maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet) truncation/padding to a maximum length will be deactivated.

TYPE: `int`, *optional* DEFAULT: None

pad_to_multiple_of

If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).

TYPE: `int`, *optional* DEFAULT: None

return_attention_mask

Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the return_outputs attribute.

What are attention masks?

TYPE: `bool`, *optional* DEFAULT: None

return_tensors

If set, will return tensors instead of list of python integers. Acceptable values are:

  • 'tf': Return TensorFlow tf.constant objects.
  • 'pt': Return PyTorch torch.Tensor objects.
  • 'np': Return Numpy np.ndarray objects.

TYPE: `str` or [`~file_utils.TensorType`], *optional* DEFAULT: None

verbose

Whether or not to print more information and warnings.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

RETURNS DESCRIPTION
BatchEncoding

BatchEncoding containing the token_ids.

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
def __call__(
    self,
    notes: Union[
        np.ndarray,
        List["pretty_midi.Note"],
        List[List["pretty_midi.Note"]],
    ],
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = None,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    verbose: bool = True,
    **kwargs,
) -> BatchEncoding:
    r"""
    This is the `__call__` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer generated
    token ids.

    Args:
        notes (`numpy.ndarray` of shape `[batch_size, max_sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
            This represents the midi notes.

            If `notes` is a `numpy.ndarray`:

            Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.

            If `notes` is a `list` containing `pretty_midi.Note` objects:

            - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
        padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
            Activates and controls padding. Accepts the following values:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
            sequence if provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
            acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
            lengths).
        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `None`):
            Activates and controls truncation. Accepts the following values:

            - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
            to the maximum acceptable input length for the model if that argument is not provided. This will
            truncate token by token, removing a token from the longest sequence in the pair if a pair of
            sequences (or a batch of pairs) is provided.
            - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
            maximum acceptable input length for the model if that argument is not provided. This will only
            truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
            - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
            maximum acceptable input length for the model if that argument is not provided. This will only
            truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
            greater than the model maximum admissible input size).
        max_length (`int`, *optional*):
            Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
            `None`, this will use the predefined model maximum length if a maximum length is required by one of the
            truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
            truncation/padding to a maximum length will be deactivated.
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
            the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
        return_attention_mask (`bool`, *optional*):
            Whether to return the attention mask. If left to the default, will return the attention mask according
            to the specific tokenizer's default, defined by the `return_outputs` attribute.

            [What are attention masks?](../glossary#attention-mask)
        return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return Numpy `np.ndarray` objects.
        verbose (`bool`, *optional*, defaults to `True`):
            Whether or not to print more information and warnings.

    Returns:
        `BatchEncoding` containing the token_ids.
    """
    # Check if it is batched or not.
    # It is batched if it's a list containing a list of `pretty_midi.Notes` where the outer list contains all the
    # batches and the inner list contains all Notes for a single batch. Otherwise if np.ndarray is passed it will be
    # considered batched if it has shape of `[batch_size, sequence_length, 4]` or ndim=3.
    is_batched = notes.ndim == 3 if isinstance(notes, np.ndarray) else isinstance(notes[0], list)

    # Resolve the truncation and padding strategies once, up front.
    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )

    if is_batched:
        # If the user has not explicitly mentioned `return_attention_mask` as False, we change it to True
        return_attention_mask = True if return_attention_mask is None else return_attention_mask
        token_ids = self.batch_encode_plus(
            notes=notes,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            **kwargs,
        )
    else:
        token_ids = self.encode_plus(
            notes=notes,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            **kwargs,
        )

    # Since we already have truncated sequences we are just left to do padding.
    token_ids = self.pad(
        token_ids,
        padding=padding_strategy,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
        return_tensors=return_tensors,
        verbose=verbose,
    )

    return token_ids

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.__init__(vocab, default_velocity=77, num_bars=2, unk_token='-1', eos_token='1', pad_token='0', bos_token='2', **kwargs)

This method initializes an instance of the Pop2PianoTokenizer class.

PARAMETER DESCRIPTION
self

The instance of the Pop2PianoTokenizer class.

vocab

The path to the vocabulary file.

TYPE: str

default_velocity

The default velocity for the tokenizer, default value is 77.

TYPE: int DEFAULT: 77

num_bars

The number of bars.

TYPE: int DEFAULT: 2

unk_token

The unknown token for the tokenizer. If str, it will be converted to an AddedToken.

TYPE: str or AddedToken DEFAULT: '-1'

eos_token

The end-of-sequence token for the tokenizer. If str, it will be converted to an AddedToken.

TYPE: str or AddedToken DEFAULT: '1'

pad_token

The padding token for the tokenizer. If str, it will be converted to an AddedToken.

TYPE: str or AddedToken DEFAULT: '0'

bos_token

The beginning-of-sequence token for the tokenizer. If str, it will be converted to an AddedToken.

TYPE: str or AddedToken DEFAULT: '2'

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
FileNotFoundError

If the 'vocab' file is not found.

JSONDecodeError

If there is an error decoding the JSON data from the 'vocab' file.

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def __init__(
    self,
    vocab,
    default_velocity=77,
    num_bars=2,
    unk_token="-1",
    eos_token="1",
    pad_token="0",
    bos_token="2",
    **kwargs,
):
    """
    Initialize an instance of the Pop2PianoTokenizer class.

    Args:
        self: The instance of the Pop2PianoTokenizer class.
        vocab (str): The path to the vocabulary file.
        default_velocity (int): The default velocity for the tokenizer, default value is 77.
        num_bars (int): The number of bars.
        unk_token (str or AddedToken): The unknown token for the tokenizer.
            If str, it will be converted to an AddedToken.
        eos_token (str or AddedToken): The end-of-sequence token for the tokenizer.
            If str, it will be converted to an AddedToken.
        pad_token (str or AddedToken): The padding token for the tokenizer.
            If str, it will be converted to an AddedToken.
        bos_token (str or AddedToken): The beginning-of-sequence token for the tokenizer.
            If str, it will be converted to an AddedToken.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the 'vocab' file is not found.
        JSONDecodeError: If there is an error decoding the JSON data from the 'vocab' file.
    """
    def _as_added_token(token):
        # Plain strings become AddedToken objects; AddedToken inputs pass through unchanged.
        return AddedToken(token, lstrip=False, rstrip=False) if isinstance(token, str) else token

    unk_token = _as_added_token(unk_token)
    eos_token = _as_added_token(eos_token)
    pad_token = _as_added_token(pad_token)
    bos_token = _as_added_token(bos_token)

    self.default_velocity = default_velocity
    self.num_bars = num_bars

    # Load the token -> id vocabulary from disk.
    with open(vocab, "rb") as file:
        self.encoder = json.load(file)

    # Inverse mapping for decoding: id -> token.
    self.decoder = {v: k for k, v in self.encoder.items()}

    super().__init__(
        unk_token=unk_token,
        eos_token=eos_token,
        pad_token=pad_token,
        bos_token=bos_token,
        **kwargs,
    )

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.batch_decode(token_ids, feature_extractor_output, return_midi=True)

This is the batch_decode method for Pop2PianoTokenizer. It converts the token_ids generated by the transformer to midi_notes and returns them.

PARAMETER DESCRIPTION
token_ids

Output token_ids of Pop2PianoConditionalGeneration model.

TYPE: `Union[np.ndarray, torch.Tensor, tf.Tensor]`

feature_extractor_output

Denotes the output of Pop2PianoFeatureExtractor.__call__. It must contain "beatstep" and "extrapolated_beatstep". Also "attention_mask_beatsteps" and "attention_mask_extrapolated_beatstep" should be present if they were returned by the feature extractor.

TYPE: `BatchFeature`

return_midi

Whether to return midi object or not.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

RETURNS DESCRIPTION

Conditional Return: If return_midi is True:

  • BatchEncoding containing both notes and pretty_midi.pretty_midi.PrettyMIDI objects.

If return_midi is False:

  • BatchEncoding containing notes.
Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
def batch_decode(
    self,
    token_ids,
    feature_extractor_output: BatchFeature,
    return_midi: bool = True,
):
    r"""
    This is the `batch_decode` method for `Pop2PianoTokenizer`. It converts the token_ids generated by the
    transformer to midi_notes and returns them.

    Args:
        token_ids (`Union[np.ndarray, torch.Tensor, tf.Tensor]`):
            Output token_ids of `Pop2PianoConditionalGeneration` model.
        feature_extractor_output (`BatchFeature`):
            Denotes the output of `Pop2PianoFeatureExtractor.__call__`. It must contain `"beatstep"` and
            `"extrapolated_beatstep"`. Also `"attention_mask_beatsteps"` and
            `"attention_mask_extrapolated_beatstep"`
             should be present if they were returned by the feature extractor.
        return_midi (`bool`, *optional*, defaults to `True`):
            Whether to return midi object or not.

    Returns:
        Conditional Return:
            If `return_midi` is True:

            - `BatchEncoding` containing both `notes` and `pretty_midi.pretty_midi.PrettyMIDI` objects.

            If `return_midi` is False:

            - `BatchEncoding` containing `notes`.
    """
    # check if they have attention_masks(attention_mask, attention_mask_beatsteps, attention_mask_extrapolated_beatstep) or not
    attention_masks_present = bool(
        hasattr(feature_extractor_output, "attention_mask")
        and hasattr(feature_extractor_output, "attention_mask_beatsteps")
        and hasattr(feature_extractor_output, "attention_mask_extrapolated_beatstep")
    )

    # if we are processing batched inputs then we must need attention_masks
    if not attention_masks_present and feature_extractor_output["beatsteps"].shape[0] > 1:
        raise ValueError(
            "attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep must be present "
            "for batched inputs! But one of them was not present."
        )

    # check for length mismatch between inputs_embeds, beatsteps and extrapolated_beatstep
    if attention_masks_present:
        # since we know about the number of examples in token_ids from attention_mask
        if (
            sum(feature_extractor_output["attention_mask"][:, 0] == 0)
            != feature_extractor_output["beatsteps"].shape[0]
            or feature_extractor_output["beatsteps"].shape[0]
            != feature_extractor_output["extrapolated_beatstep"].shape[0]
        ):
            raise ValueError(
                "Length mismatch between token_ids, beatsteps and extrapolated_beatstep! Found "
                f"token_ids length - {token_ids.shape[0]}, beatsteps shape - {feature_extractor_output['beatsteps'].shape[0]} "
                f"and extrapolated_beatsteps shape - {feature_extractor_output['extrapolated_beatstep'].shape[0]}"
            )
        if feature_extractor_output["attention_mask"].shape[0] != token_ids.shape[0]:
            raise ValueError(
                f"Found attention_mask of length - {feature_extractor_output['attention_mask'].shape[0]} but token_ids of length - {token_ids.shape[0]}"
            )
    else:
        # if there is no attention mask present then it's surely a single example
        if (
            feature_extractor_output["beatsteps"].shape[0] != 1
            or feature_extractor_output["extrapolated_beatstep"].shape[0] != 1
        ):
            raise ValueError(
                "Length mismatch of beatsteps and extrapolated_beatstep! Since attention_mask is not present the number of examples must be 1, "
                f"But found beatsteps length - {feature_extractor_output['beatsteps'].shape[0]}, extrapolated_beatsteps length - {feature_extractor_output['extrapolated_beatstep'].shape[0]}."
            )

    if attention_masks_present:
        # check for zeros(since token_ids are seperated by zero arrays)
        batch_idx = np.where(feature_extractor_output["attention_mask"][:, 0] == 0)[0]
    else:
        batch_idx = [token_ids.shape[0]]

    notes_list = []
    pretty_midi_objects_list = []
    start_idx = 0
    for index, end_idx in enumerate(batch_idx):
        each_tokens_ids = token_ids[start_idx:end_idx]
        # check where the whole example ended by searching for eos_token_id and getting the upper bound
        # NOTE(review): assumes self.eos_token is castable to int — confirm against the tokenizer config.
        each_tokens_ids = each_tokens_ids[:, : int(np.max(np.where(each_tokens_ids == int(self.eos_token))[1])) + 1]
        beatsteps = feature_extractor_output["beatsteps"][index]
        extrapolated_beatstep = feature_extractor_output["extrapolated_beatstep"][index]

        # if attention mask is present then mask out real array/tensor
        if attention_masks_present:
            attention_mask_beatsteps = feature_extractor_output["attention_mask_beatsteps"][index]
            attention_mask_extrapolated_beatstep = feature_extractor_output[
                "attention_mask_extrapolated_beatstep"
            ][index]
            beatsteps = beatsteps[: int(np.max(np.where(attention_mask_beatsteps == 1)[0])) + 1]
            extrapolated_beatstep = extrapolated_beatstep[
                : int(np.max(np.where(attention_mask_extrapolated_beatstep == 1)[0])) + 1
            ]

        each_tokens_ids = to_numpy(each_tokens_ids)
        beatsteps = to_numpy(beatsteps)
        extrapolated_beatstep = to_numpy(extrapolated_beatstep)

        pretty_midi_object = self.relative_batch_tokens_ids_to_midi(
            tokens=each_tokens_ids,
            beatstep=extrapolated_beatstep,
            bars_per_batch=self.num_bars,
            cutoff_time_idx=(self.num_bars + 1) * 4,
        )

        # shift every generated note so it starts at the first real beatstep time
        for note in pretty_midi_object.instruments[0].notes:
            note.start += beatsteps[0]
            note.end += beatsteps[0]
            notes_list.append(note)

        pretty_midi_objects_list.append(pretty_midi_object)
        # NOTE(review): `+=` (not `=`) matches the upstream implementation; verify for >2 batched examples.
        start_idx += end_idx + 1  # 1 represents the zero array

    if return_midi:
        return BatchEncoding({"notes": notes_list, "pretty_midi_objects": pretty_midi_objects_list})

    return BatchEncoding({"notes": notes_list})

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.batch_encode_plus(notes, truncation_strategy=None, max_length=None, **kwargs)

This is the batch_encode_plus method for Pop2PianoTokenizer. It converts the midi notes to the transformer generated token ids. It works on multiple batches by calling encode_plus multiple times in a loop.

PARAMETER DESCRIPTION
notes

This represents the midi notes. If notes is a numpy.ndarray:

  • Each sequence must have 4 values, they are onset idx, offset idx, pitch and velocity.

If notes is a list containing pretty_midi.Note objects:

  • Each sequence must have 4 attributes, they are start, end, pitch and velocity.

TYPE: `numpy.ndarray` of shape `[batch_size, sequence_length, 4]` or `list` of `pretty_midi.Note` objects

truncation_strategy

Indicates the truncation strategy that is going to be used during truncation.

TYPE: [`~tokenization_utils_base.TruncationStrategy`], *optional* DEFAULT: None

max_length

Maximum length of the returned list and optionally padding length (see above).

TYPE: `int`, *optional* DEFAULT: None

RETURNS DESCRIPTION
BatchEncoding

BatchEncoding containing the tokens ids.

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
def batch_encode_plus(
    self,
    notes: Union[np.ndarray, List["pretty_midi.Note"]],
    truncation_strategy: Optional[TruncationStrategy] = None,
    max_length: Optional[int] = None,
    **kwargs,
) -> BatchEncoding:
    r"""
    Batched counterpart of `encode_plus` for `Pop2PianoTokenizer`: converts midi notes for several
    examples into token ids by delegating to `encode_plus` once per example.

    Args:
        notes (`numpy.ndarray` of shape `[batch_size, sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
            This represents the midi notes. If `notes` is a `numpy.ndarray`:

            - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.

            If `notes` is a `list` containing `pretty_midi.Note` objects:

            - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
        truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*):
            Indicates the truncation strategy that is going to be used during truncation.
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).

    Returns:
        `BatchEncoding` containing the tokens ids.
    """
    # Encode each example independently; `encode_plus` handles truncation per example.
    token_ids_per_example = [
        self.encode_plus(
            example_notes,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            **kwargs,
        )["token_ids"]
        for example_notes in notes
    ]

    return BatchEncoding({"token_ids": token_ids_per_example})

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.encode_plus(notes, truncation_strategy=None, max_length=None, **kwargs)

This is the encode_plus method for Pop2PianoTokenizer. It converts the midi notes to the transformer generated token ids. It only works on a single batch, to process multiple batches please use batch_encode_plus or __call__ method.

PARAMETER DESCRIPTION
notes

This represents the midi notes. If notes is a numpy.ndarray:

  • Each sequence must have 4 values, they are onset idx, offset idx, pitch and velocity.

If notes is a list containing pretty_midi.Note objects:

  • Each sequence must have 4 attributes, they are start, end, pitch and velocity.

TYPE: `numpy.ndarray` of shape `[sequence_length, 4]` or `list` of `pretty_midi.Note` objects

truncation_strategy

Indicates the truncation strategy that is going to be used during truncation.

TYPE: [`~tokenization_utils_base.TruncationStrategy`], *optional* DEFAULT: None

max_length

Maximum length of the returned list and optionally padding length (see above).

TYPE: `int`, *optional* DEFAULT: None

RETURNS DESCRIPTION
BatchEncoding

BatchEncoding containing the tokens ids.

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
def encode_plus(
    self,
    notes: Union[np.ndarray, List["pretty_midi.Note"]],
    truncation_strategy: Optional[TruncationStrategy] = None,
    max_length: Optional[int] = None,
    **kwargs,
) -> BatchEncoding:
    r"""
    Single-example encoder for `Pop2PianoTokenizer`: converts midi notes into token ids.
    For multiple examples use `batch_encode_plus` or the `__call__` method instead.

    Args:
        notes (`numpy.ndarray` of shape `[sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
            This represents the midi notes. If `notes` is a `numpy.ndarray`:

            - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.

            If `notes` is a `list` containing `pretty_midi.Note` objects:

            - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
        truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*):
            Indicates the truncation strategy that is going to be used during truncation.
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).

    Returns:
        `BatchEncoding` containing the tokens ids.
    """
    requires_backends(self, ["pretty_midi"])

    # Normalize pretty_midi.Note objects to an [n, 4] array of
    # (start, end, pitch, velocity) so both input forms share one code path.
    if isinstance(notes[0], pretty_midi.Note):
        notes = np.array(
            [[note.start, note.end, note.pitch, note.velocity] for note in notes]
        ).reshape(-1, 4)

    # Round every value to the closest integer time index / midi value.
    notes = np.round(notes).astype(np.int32)
    max_time_idx = notes[:, :2].max()

    # Bucket (pitch, velocity) events by time index; a note-off is encoded as
    # the same pitch with velocity 0.
    events_per_time = [[] for _ in range(max_time_idx + 1)]
    for onset, offset, pitch, velocity in notes:
        events_per_time[onset].append([pitch, velocity])
        events_per_time[offset].append([pitch, 0])

    token_ids = []
    current_velocity = 0
    for time_idx, events in enumerate(events_per_time):
        if not events:
            continue
        # Emit a time token once per occupied time step, then its note events.
        token_ids.append(self._convert_token_to_id(time_idx, "TOKEN_TIME"))
        for pitch, velocity in events:
            # Velocity is binarized: only on/off transitions emit a velocity token.
            velocity = int(velocity > 0)
            if velocity != current_velocity:
                current_velocity = velocity
                token_ids.append(self._convert_token_to_id(velocity, "TOKEN_VELOCITY"))
            token_ids.append(self._convert_token_to_id(pitch, "TOKEN_NOTE"))

    total_len = len(token_ids)

    # Apply truncation only when a strategy and a max length are both given.
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
        token_ids, _, _ = self.truncate_sequences(
            ids=token_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation_strategy,
            **kwargs,
        )

    return BatchEncoding({"token_ids": token_ids})

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.get_vocab()

Returns the vocabulary of the tokenizer

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
180
181
182
def get_vocab(self):
    """Return the full vocabulary: the base encoder merged with any added tokens."""
    vocab = dict(self.encoder)
    vocab.update(self.added_tokens_encoder)
    return vocab

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.notes_to_midi(notes, beatstep, offset_sec=0.0)

Converts notes to Midi.

PARAMETER DESCRIPTION
notes

This is used to create Pretty Midi objects.

TYPE: `numpy.ndarray`

beatstep

This is the extrapolated beatstep that we get from feature extractor.

TYPE: `numpy.ndarray`

offset_sec

This represents the offset seconds which is used while creating each Pretty Midi Note.

TYPE: `int`, *optional*, defaults to 0.0 DEFAULT: 0.0

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
def notes_to_midi(self, notes: np.ndarray, beatstep: np.ndarray, offset_sec: float = 0.0):
    """
    Converts notes to Midi.

    Args:
        notes (`numpy.ndarray`):
            This is used to create Pretty Midi objects. Each row is
            `(onset_idx, offset_idx, pitch, velocity)`, where the idx columns
            index into `beatstep` to recover times in seconds.
        beatstep (`numpy.ndarray`):
            This is the extrapolated beatstep that we get from feature extractor.
        offset_sec (`float`, *optional*, defaults to 0.0):
            This represents the offset seconds which is used while creating each Pretty Midi Note.
    """
    requires_backends(self, ["pretty_midi"])

    # Pop2Piano renders with 384 ticks per quarter note at 120 BPM.
    new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
    new_inst = pretty_midi.Instrument(program=0)  # program 0: Acoustic Grand Piano
    new_notes = []

    for onset_idx, offset_idx, pitch, velocity in notes:
        new_note = pretty_midi.Note(
            velocity=velocity,
            pitch=pitch,
            start=beatstep[onset_idx] - offset_sec,
            end=beatstep[offset_idx] - offset_sec,
        )
        new_notes.append(new_note)
    new_inst.notes = new_notes
    new_pm.instruments.append(new_inst)
    # Drop notes whose end time is not after their start time.
    new_pm.remove_invalid_notes()
    return new_pm

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.relative_batch_tokens_ids_to_midi(tokens, beatstep, beat_offset_idx=0, bars_per_batch=2, cutoff_time_idx=12)

Converts tokens to Midi. This method calls relative_batch_tokens_ids_to_notes method to convert batch tokens to notes then uses notes_to_midi method to convert them to Midi.

PARAMETER DESCRIPTION
tokens

Denotes tokens which alongside beatstep will be converted to Midi.

TYPE: `numpy.ndarray`

beatstep

We get beatstep from feature extractor which is also used to get Midi.

TYPE: `np.ndarray`

beat_offset_idx

Denotes beat offset index for each note in generated Midi.

TYPE: `int`, *optional*, defaults to 0 DEFAULT: 0

bars_per_batch

A parameter to control the Midi output generation.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

cutoff_time_idx

Denotes the cutoff time index for each note in generated Midi.

TYPE: `int`, *optional*, defaults to 12 DEFAULT: 12

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def relative_batch_tokens_ids_to_midi(
    self,
    tokens: np.ndarray,
    beatstep: np.ndarray,
    beat_offset_idx: int = 0,
    bars_per_batch: int = 2,
    cutoff_time_idx: int = 12,
):
    """
    Converts tokens to Midi. First decodes the batch of tokens into notes via
    `relative_batch_tokens_ids_to_notes`, then renders them with `notes_to_midi`.

    Args:
        tokens (`numpy.ndarray`):
            Denotes tokens which alongside beatstep will be converted to Midi.
        beatstep (`np.ndarray`):
            We get beatstep from feature extractor which is also used to get Midi.
        beat_offset_idx (`int`, *optional*, defaults to 0):
            Denotes beat offset index for each note in generated Midi.
        bars_per_batch (`int`, *optional*, defaults to 2):
            A parameter to control the Midi output generation.
        cutoff_time_idx (`int`, *optional*, defaults to 12):
            Denotes the cutoff time index for each note in generated Midi.
    """
    # Treat an explicit None the same as the default offset of 0.
    if beat_offset_idx is None:
        beat_offset_idx = 0

    notes = self.relative_batch_tokens_ids_to_notes(
        tokens=tokens,
        beat_offset_idx=beat_offset_idx,
        bars_per_batch=bars_per_batch,
        cutoff_time_idx=cutoff_time_idx,
    )
    # Anchor all note times at the beatstep the offset index points to.
    return self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.relative_batch_tokens_ids_to_notes(tokens, beat_offset_idx, bars_per_batch, cutoff_time_idx)

Converts relative tokens to notes which are then used to generate pretty midi object.

PARAMETER DESCRIPTION
tokens

Tokens to be converted to notes.

TYPE: `numpy.ndarray`

beat_offset_idx

Denotes beat offset index for each note in generated Midi.

TYPE: `int`

bars_per_batch

A parameter to control the Midi output generation.

TYPE: `int`

cutoff_time_idx

Denotes the cutoff time index for each note in generated Midi.

TYPE: `int`

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def relative_batch_tokens_ids_to_notes(
    self,
    tokens: np.ndarray,
    beat_offset_idx: int,
    bars_per_batch: int,
    cutoff_time_idx: int,
):
    """
    Converts relative tokens to notes which are then used to generate pretty midi object.

    Args:
        tokens (`numpy.ndarray`):
            Tokens to be converted to notes.
        beat_offset_idx (`int`):
            Denotes beat offset index for each note in generated Midi.
        bars_per_batch (`int`):
            A parameter to control the Midi output generation.
        cutoff_time_idx (`int`):
            Denotes the cutoff time index for each note in generated Midi.
    """
    notes = None

    for batch_index, batch_tokens in enumerate(tokens):
        # Each batch covers `bars_per_batch` bars of 4 beats, so successive
        # batches are shifted by bars_per_batch * 4 time indices.
        batch_start_idx = beat_offset_idx + batch_index * bars_per_batch * 4
        batch_notes = self.relative_tokens_ids_to_notes(
            batch_tokens,
            start_idx=batch_start_idx,
            cutoff_time_idx=cutoff_time_idx + batch_start_idx,
        )

        if len(batch_notes) == 0:
            continue
        notes = batch_notes if notes is None else np.concatenate((notes, batch_notes), axis=0)

    # Mirror the original contract: an empty result is a plain list.
    return [] if notes is None else notes

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.relative_tokens_ids_to_notes(tokens, start_idx, cutoff_time_idx=None)

Converts relative tokens to notes which will then be used to create Pretty Midi objects.

PARAMETER DESCRIPTION
tokens

Relative Tokens which will be converted to notes.

TYPE: `numpy.ndarray`

start_idx

A parameter which denotes the starting index.

TYPE: `float`

cutoff_time_idx

A parameter used while converting tokens to notes.

TYPE: `float`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
def relative_tokens_ids_to_notes(self, tokens: np.ndarray, start_idx: float, cutoff_time_idx: float = None):
    """
    Converts relative tokens to notes which will then be used to create Pretty Midi objects.

    Args:
        tokens (`numpy.ndarray`):
            Relative Tokens which will be converted to notes.
        start_idx (`float`):
            A parameter which denotes the starting index.
        cutoff_time_idx (`float`, *optional*):
            A parameter used while converting tokens to notes.
    """
    # Decode each token id into a (token_type, number) pair.
    words = [self._convert_id_to_token(token) for token in tokens]

    current_idx = start_idx
    current_velocity = 0
    # One slot per pitch in the vocabulary; a non-None entry holds the onset
    # index of a note that has started but not yet received its offset.
    note_onsets_ready = [None for i in range(sum(k.endswith("NOTE") for k in self.encoder.keys()) + 1)]
    notes = []
    for token_type, number in words:
        if token_type == "TOKEN_SPECIAL":
            # number == 1 marks end-of-sequence; stop decoding.
            if number == 1:
                break
        elif token_type == "TOKEN_TIME":
            # Advance the running time index (clamped at cutoff_time_idx).
            current_idx = token_time_to_note(
                number=number, cutoff_time_idx=cutoff_time_idx, current_idx=current_idx
            )
        elif token_type == "TOKEN_VELOCITY":
            current_velocity = number

        elif token_type == "TOKEN_NOTE":
            # Register a note onset or close a pending note at current_idx.
            notes = token_note_to_note(
                number=number,
                current_velocity=current_velocity,
                default_velocity=self.default_velocity,
                note_onsets_ready=note_onsets_ready,
                current_idx=current_idx,
                notes=notes,
            )
        else:
            raise ValueError("Token type not understood!")

    for pitch, note_onset in enumerate(note_onsets_ready):
        # force offset if no offset for each pitch
        if note_onset is not None:
            if cutoff_time_idx is None:
                cutoff = note_onset + 1
            else:
                cutoff = max(cutoff_time_idx, note_onset + 1)

            offset_idx = max(current_idx, cutoff)
            notes.append([note_onset, offset_idx, pitch, self.default_velocity])

    if len(notes) == 0:
        return []

    # Sort notes by onset first, then by offset (128 > max possible offset delta).
    notes = np.array(notes)
    note_order = notes[:, 0] * 128 + notes[:, 1]
    notes = notes[note_order.argsort()]
    return notes

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.Pop2PianoTokenizer.save_vocabulary(save_directory, filename_prefix=None)

Saves the tokenizer's vocabulary dictionary to the provided save_directory.

PARAMETER DESCRIPTION
save_directory

A path to the directory where to saved. It will be created if it doesn't exist.

TYPE: `str`

filename_prefix

A prefix to add to the names of the files saved by the tokenizer.

TYPE: `Optional[str]`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Saves the tokenizer's vocabulary dictionary to the provided save_directory.

    Args:
        save_directory (`str`):
            A path to the directory where to saved. It will be created if it doesn't exist.
        filename_prefix (`Optional[str]`, *optional*):
            A prefix to add to the names of the files saved by the tokenizer.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
        return None

    # Serialize the encoder mapping to the (optionally prefixed) vocab file.
    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab"])
    with open(out_vocab_file, "w") as file:
        file.write(json.dumps(self.encoder))

    return (out_vocab_file,)

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.token_note_to_note(number, current_velocity, default_velocity, note_onsets_ready, current_idx, notes)

This function updates the notes list based on the given parameters.

PARAMETER DESCRIPTION
number

The number of the note.

TYPE: int

current_velocity

The current velocity of the note.

TYPE: int

default_velocity

The default velocity for the note.

TYPE: int

note_onsets_ready

A list containing the onset index for each note. If an onset index is None, it means that the note has not yet started.

TYPE: list or None

current_idx

The current index.

TYPE: int

notes

A list containing the notes and their properties.

TYPE: list

RETURNS DESCRIPTION

None.

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def token_note_to_note(number, current_velocity, default_velocity, note_onsets_ready, current_idx, notes):
    """
    This function updates the notes list based on the given parameters.

    Args:
        number (int): The number (pitch slot) of the note.
        current_velocity (int): The current velocity of the note.
        default_velocity (int): The default velocity for the note.
        note_onsets_ready (list): A list containing the onset index for each pitch.
            If an onset index is None, it means that the note has not yet started.
        current_idx (int): The current index.
        notes (list): A list containing the notes and their properties.

    Returns:
        list: The (possibly extended) `notes` list; entries are
        `[onset_idx, offset_idx, pitch, velocity]`. Note that
        `note_onsets_ready` is also updated in place as a side effect.
    """
    if note_onsets_ready[number] is not None:
        # offset with onset
        onset_idx = note_onsets_ready[number]
        if onset_idx < current_idx:
            # Time shift after previous note_on: close the pending note.
            offset_idx = current_idx
            notes.append([onset_idx, offset_idx, number, default_velocity])
            # Non-zero velocity re-arms the note at current_idx
            # (note-off immediately followed by a new note-on).
            onsets_ready = None if current_velocity == 0 else current_idx
            note_onsets_ready[number] = onsets_ready
    else:
        note_onsets_ready[number] = current_idx
    return notes

mindnlp.transformers.models.pop2piano.tokenization_pop2piano.token_time_to_note(number, cutoff_time_idx, current_idx)

PARAMETER DESCRIPTION
number

The amount to increment the current index by.

TYPE: int

cutoff_time_idx

The maximum index value allowed, can be None.

TYPE: int or None

current_idx

The current index value.

TYPE: int

RETURNS DESCRIPTION
current_idx

The updated current index value, respecting the cutoff_time_idx if provided.

Source code in mindnlp\transformers\models\pop2piano\tokenization_pop2piano.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def token_time_to_note(number, cutoff_time_idx, current_idx):
    """
    Advance the running time index by `number`, clamping at `cutoff_time_idx`.

    Args:
        number (int): The amount to increment the current index by.
        cutoff_time_idx (int or None): The maximum index value allowed, can be None.
        current_idx (int): The current index value.

    Returns:
        int: The updated index, limited to `cutoff_time_idx` when one is given.
    """
    advanced_idx = current_idx + number
    if cutoff_time_idx is None:
        return advanced_idx
    return min(advanced_idx, cutoff_time_idx)

mindnlp.transformers.models.pop2piano.processing_pop2piano

Processor class for Pop2Piano.

mindnlp.transformers.models.pop2piano.processing_pop2piano.Pop2PianoProcessor

Bases: ProcessorMixin

Constructs a Pop2Piano processor which wraps a Pop2Piano Feature Extractor and Pop2Piano Tokenizer into a single processor.

[Pop2PianoProcessor] offers all the functionalities of [Pop2PianoFeatureExtractor] and [Pop2PianoTokenizer]. See the docstring of [~Pop2PianoProcessor.__call__] and [~Pop2PianoProcessor.decode] for more information.

PARAMETER DESCRIPTION
feature_extractor

An instance of [Pop2PianoFeatureExtractor]. The feature extractor is a required input.

TYPE: `Pop2PianoFeatureExtractor`

tokenizer

An instance of [`Pop2PianoTokenizer`]. The tokenizer is a required input.

TYPE: `Pop2PianoTokenizer`

Source code in mindnlp\transformers\models\pop2piano\processing_pop2piano.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
class Pop2PianoProcessor(ProcessorMixin):
    r"""
    Constructs a Pop2Piano processor which wraps a Pop2Piano Feature Extractor and Pop2Piano Tokenizer into a single
    processor.

    [`Pop2PianoProcessor`] offers all the functionalities of [`Pop2PianoFeatureExtractor`] and [`Pop2PianoTokenizer`].
    See the docstring of [`~Pop2PianoProcessor.__call__`] and [`~Pop2PianoProcessor.decode`] for more information.

    Args:
        feature_extractor (`Pop2PianoFeatureExtractor`):
            An instance of [`Pop2PianoFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`Pop2PianoTokenizer`):
            An instance of [`Pop2PianoTokenizer`]. The tokenizer is a required input.
    """
    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "Pop2PianoFeatureExtractor"
    tokenizer_class = "Pop2PianoTokenizer"

    def __call__(
        self,
        audio: Union[np.ndarray, List[float], List[np.ndarray]] = None,
        sampling_rate: Union[int, List[int]] = None,
        steps_per_beat: int = 2,
        resample: Optional[bool] = True,
        notes: Union[List, TensorType] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        verbose: bool = True,
        **kwargs,
    ) -> Union[BatchFeature, BatchEncoding]:
        """
        This method uses [`Pop2PianoFeatureExtractor.__call__`] method to prepare log-mel-spectrograms for the model,
        and [`Pop2PianoTokenizer.__call__`] to prepare token_ids from notes.

        Please refer to the docstring of the above two methods for more information.
        """
        # The feature extractor needs *both* audio and sampling_rate, while the tokenizer needs notes.
        # Fail fast unless at least one complete input set was supplied. (The previous check only
        # rejected the case where audio AND sampling_rate were both missing, so supplying just one of
        # them with no notes fell through to `return inputs` and crashed with an UnboundLocalError.)
        if (audio is None or sampling_rate is None) and notes is None:
            raise ValueError(
                "You have to specify at least audios and sampling_rate in order to use feature extractor or "
                "notes to use the tokenizer part."
            )

        if audio is not None and sampling_rate is not None:
            inputs = self.feature_extractor(
                audio=audio,
                sampling_rate=sampling_rate,
                steps_per_beat=steps_per_beat,
                resample=resample,
                **kwargs,
            )

        if notes is not None:
            encoded_token_ids = self.tokenizer(
                notes=notes,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                pad_to_multiple_of=pad_to_multiple_of,
                verbose=verbose,
                **kwargs,
            )

        # Only one of the two outputs exists -> return it directly.
        if notes is None:
            return inputs

        if audio is None or sampling_rate is None:
            return encoded_token_ids

        # Both were produced: merge the tokenizer output into the feature extractor output.
        inputs["token_ids"] = encoded_token_ids["token_ids"]
        return inputs

    def batch_decode(
        self,
        token_ids,
        feature_extractor_output: BatchFeature,
        return_midi: bool = True,
    ) -> BatchEncoding:
        """
        This method uses [`Pop2PianoTokenizer.batch_decode`] method to convert model generated token_ids to midi_notes.

        Please refer to the docstring of the above two methods for more information.
        """
        return self.tokenizer.batch_decode(
            token_ids=token_ids, feature_extractor_output=feature_extractor_output, return_midi=return_midi
        )

    @property
    def model_input_names(self):
        """
        Return the combined, de-duplicated model input names of the tokenizer and feature extractor.

        Returns:
            list: Tokenizer input names followed by feature extractor input names, duplicates removed
                while preserving first-seen order.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        # dict.fromkeys keeps insertion order, so duplicates are dropped without reordering.
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))

    def save_pretrained(self, save_directory, **kwargs):
        """
        Save the processor's components and configuration to a directory, creating it if needed.

        Args:
            save_directory (str): The directory path where the files will be saved.
                It must be a directory, not a file.

        Returns:
            The value returned by `ProcessorMixin.save_pretrained`.

        Raises:
            ValueError: If `save_directory` points at an existing file.
        """
        if os.path.isfile(save_directory):
            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file.")
        os.makedirs(save_directory, exist_ok=True)
        return super().save_pretrained(save_directory, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Instantiate a Pop2PianoProcessor from a pretrained model name or path.

        Args:
            pretrained_model_name_or_path (str): The name or path of the pretrained model.

        Returns:
            Pop2PianoProcessor: A processor built from the resolved feature extractor and tokenizer.

        Raises:
            Any exceptions raised by the `_get_arguments_from_pretrained` method.
        """
        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(*args)

mindnlp.transformers.models.pop2piano.processing_pop2piano.Pop2PianoProcessor.model_input_names property

Returns a list of model input names for the Pop2PianoProcessor.

PARAMETER DESCRIPTION
self

The instance of the Pop2PianoProcessor class.

RETURNS DESCRIPTION

None.

mindnlp.transformers.models.pop2piano.processing_pop2piano.Pop2PianoProcessor.__call__(audio=None, sampling_rate=None, steps_per_beat=2, resample=True, notes=None, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs)

This method uses [Pop2PianoFeatureExtractor.__call__] method to prepare log-mel-spectrograms for the model, and [Pop2PianoTokenizer.__call__] to prepare token_ids from notes.

Please refer to the docstring of the above two methods for more information.

Source code in mindnlp\transformers\models\pop2piano\processing_pop2piano.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def __call__(
    self,
    audio: Union[np.ndarray, List[float], List[np.ndarray]] = None,
    sampling_rate: Union[int, List[int]] = None,
    steps_per_beat: int = 2,
    resample: Optional[bool] = True,
    notes: Union[List, TensorType] = None,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = None,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    verbose: bool = True,
    **kwargs,
) -> Union[BatchFeature, BatchEncoding]:
    """
    This method uses [`Pop2PianoFeatureExtractor.__call__`] method to prepare log-mel-spectrograms for the model,
    and [`Pop2PianoTokenizer.__call__`] to prepare token_ids from notes.

    Please refer to the docstring of the above two methods for more information.
    """
    # The feature extractor needs *both* audio and sampling_rate, while the tokenizer needs notes.
    # Fail fast unless at least one complete input set was supplied. (The previous check only
    # rejected the case where audio AND sampling_rate were both missing, so supplying just one of
    # them with no notes fell through to `return inputs` and crashed with an UnboundLocalError.)
    if (audio is None or sampling_rate is None) and notes is None:
        raise ValueError(
            "You have to specify at least audios and sampling_rate in order to use feature extractor or "
            "notes to use the tokenizer part."
        )

    if audio is not None and sampling_rate is not None:
        inputs = self.feature_extractor(
            audio=audio,
            sampling_rate=sampling_rate,
            steps_per_beat=steps_per_beat,
            resample=resample,
            **kwargs,
        )

    if notes is not None:
        encoded_token_ids = self.tokenizer(
            notes=notes,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

    # Only one of the two outputs exists -> return it directly.
    if notes is None:
        return inputs

    if audio is None or sampling_rate is None:
        return encoded_token_ids

    # Both were produced: merge the tokenizer output into the feature extractor output.
    inputs["token_ids"] = encoded_token_ids["token_ids"]
    return inputs

mindnlp.transformers.models.pop2piano.processing_pop2piano.Pop2PianoProcessor.batch_decode(token_ids, feature_extractor_output, return_midi=True)

This method uses [Pop2PianoTokenizer.batch_decode] method to convert model generated token_ids to midi_notes.

Please refer to the docstring of the above two methods for more information.

Source code in mindnlp\transformers\models\pop2piano\processing_pop2piano.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def batch_decode(
    self,
    token_ids,
    feature_extractor_output: BatchFeature,
    return_midi: bool = True,
) -> BatchEncoding:
    """
    Convert model-generated token_ids into midi notes by delegating to
    [`Pop2PianoTokenizer.batch_decode`].

    Please refer to the docstring of that method for more information.
    """
    decoded = self.tokenizer.batch_decode(
        token_ids=token_ids,
        feature_extractor_output=feature_extractor_output,
        return_midi=return_midi,
    )
    return decoded

mindnlp.transformers.models.pop2piano.processing_pop2piano.Pop2PianoProcessor.from_pretrained(pretrained_model_name_or_path, **kwargs) classmethod

This method creates an instance of the Pop2PianoProcessor class from a pretrained model.

PARAMETER DESCRIPTION
cls

The class object itself, automatically passed as the first argument.

TYPE: class

pretrained_model_name_or_path

The name or path of the pretrained model to be used for initialization.

TYPE: str

RETURNS DESCRIPTION

None.

Source code in mindnlp\transformers\models\pop2piano\processing_pop2piano.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """
    Instantiate a Pop2PianoProcessor from a pretrained model name or path.

    Args:
        pretrained_model_name_or_path (str): The name or path of the pretrained model.

    Returns:
        Pop2PianoProcessor: A processor constructed from the resolved constituent arguments.

    Raises:
        Any exceptions raised by the `_get_arguments_from_pretrained` method.
    """
    resolved_args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
    return cls(*resolved_args)

mindnlp.transformers.models.pop2piano.processing_pop2piano.Pop2PianoProcessor.save_pretrained(save_directory, **kwargs)

Save the model and its configuration file to a directory. If the directory does not exist, it will be created.

PARAMETER DESCRIPTION
self

The instance of the Pop2PianoProcessor class.

TYPE: Pop2PianoProcessor

save_directory

The directory path where the model and its configuration file will be saved. It should be a directory and not a file.

TYPE: str

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the provided save_directory already exists as a file instead of a directory.

Source code in mindnlp\transformers\models\pop2piano\processing_pop2piano.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def save_pretrained(self, save_directory, **kwargs):
    """
    Persist the processor's components and configuration under `save_directory`.

    The directory is created (including parents) if it does not already exist.

    Args:
        save_directory (str): Target directory for the saved files.
            It must be a directory, not a file.

    Returns:
        The value returned by `ProcessorMixin.save_pretrained`.

    Raises:
        ValueError: If `save_directory` refers to an existing file.
    """
    target_is_file = os.path.isfile(save_directory)
    if target_is_file:
        raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file.")
    os.makedirs(save_directory, exist_ok=True)
    return super().save_pretrained(save_directory, **kwargs)

mindnlp.transformers.models.pop2piano.configuration_pop2piano

Pop2Piano model configuration

mindnlp.transformers.models.pop2piano.configuration_pop2piano.Pop2PianoConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [Pop2PianoForConditionalGeneration]. It is used to instantiate a Pop2PianoForConditionalGeneration model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Pop2Piano sweetcocoa/pop2piano architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
vocab_size

Vocabulary size of the Pop2PianoForConditionalGeneration model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [Pop2PianoForConditionalGeneration].

TYPE: `int`, *optional*, defaults to 2400 DEFAULT: 2400

composer_vocab_size

Denotes the number of composers.

TYPE: `int`, *optional*, defaults to 21 DEFAULT: 21

d_model

Size of the encoder layers and the pooler layer.

TYPE: `int`, *optional*, defaults to 512 DEFAULT: 512

d_kv

Size of the key, query, value projections per attention head. The inner_dim of the projection layer will be defined as num_heads * d_kv.

TYPE: `int`, *optional*, defaults to 64 DEFAULT: 64

d_ff

Size of the intermediate feed forward layer in each Pop2PianoBlock.

TYPE: `int`, *optional*, defaults to 2048 DEFAULT: 2048

num_layers

Number of hidden layers in the Transformer encoder.

TYPE: `int`, *optional*, defaults to 6 DEFAULT: 6

num_decoder_layers

Number of hidden layers in the Transformer decoder. Will use the same value as num_layers if not set.

TYPE: `int`, *optional* DEFAULT: None

num_heads

Number of attention heads for each attention layer in the Transformer encoder.

TYPE: `int`, *optional*, defaults to 8 DEFAULT: 8

relative_attention_num_buckets

The number of buckets to use for each attention layer.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

relative_attention_max_distance

The maximum distance of the longer sequences for the bucket separation.

TYPE: `int`, *optional*, defaults to 128 DEFAULT: 128

dropout_rate

The ratio for all dropout layers.

TYPE: `float`, *optional*, defaults to 0.1 DEFAULT: 0.1

layer_norm_epsilon

The epsilon used by the layer normalization layers.

TYPE: `float`, *optional*, defaults to 1e-6 DEFAULT: 1e-06

initializer_factor

A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization testing).

TYPE: `float`, *optional*, defaults to 1.0 DEFAULT: 1.0

feed_forward_proj

Type of feed forward layer to be used. Should be one of "relu" or "gated-gelu".

TYPE: `string`, *optional*, defaults to `"gated-gelu"` DEFAULT: 'gated-gelu'

use_cache

Whether or not the model should return the last key/values attentions (not used by all models).

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

dense_act_fn

Type of Activation Function to be used in Pop2PianoDenseActDense and in Pop2PianoDenseGatedActDense.

TYPE: `string`, *optional*, defaults to `"relu"` DEFAULT: 'relu'

Source code in mindnlp\transformers\models\pop2piano\configuration_pop2piano.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class Pop2PianoConfig(PretrainedConfig):
    r"""
    Configuration class for [`Pop2PianoForConditionalGeneration`]. It is used to instantiate a
    Pop2PianoForConditionalGeneration model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults yields a configuration similar to that
    of the Pop2Piano [sweetcocoa/pop2piano](https://huggingface.co/sweetcocoa/pop2piano) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
    outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 2400):
            Vocabulary size of the `Pop2PianoForConditionalGeneration` model. Defines the number of
            different tokens that can be represented by the `inputs_ids` passed when calling
            [`Pop2PianoForConditionalGeneration`].
        composer_vocab_size (`int`, *optional*, defaults to 21):
            Denotes the number of composers.
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. The `inner_dim` of the
            projection layer will be defined as `num_heads * d_kv`.
        d_ff (`int`, *optional*, defaults to 2048):
            Size of the intermediate feed forward layer in each `Pop2PianoBlock`.
        num_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as
            `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally
            for initialization testing).
        feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all
            models).
        dense_act_fn (`string`, *optional*, defaults to `"relu"`):
            Type of Activation Function to be used in `Pop2PianoDenseActDense` and in
            `Pop2PianoDenseGatedActDense`.
    """

    model_type = "pop2piano"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=2400,
        composer_vocab_size=21,
        d_model=512,
        d_kv=64,
        d_ff=2048,
        num_layers=6,
        num_decoder_layers=None,
        num_heads=8,
        relative_attention_num_buckets=32,
        relative_attention_max_distance=128,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        feed_forward_proj="gated-gelu",  # noqa
        is_encoder_decoder=True,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        dense_act_fn="relu",
        **kwargs,
    ):
        # Decoder depth falls back to the encoder depth when not given explicitly.
        if num_decoder_layers is None:
            num_decoder_layers = num_layers

        # Vocabulary / embedding sizes.
        self.vocab_size = vocab_size
        self.composer_vocab_size = composer_vocab_size

        # Transformer geometry.
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.num_decoder_layers = num_decoder_layers
        self.num_heads = num_heads

        # Relative-position bias settings.
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance

        # Regularization / initialization.
        self.dropout_rate = dropout_rate
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor

        # Feed-forward flavour: a "gated-*" projection selects the gated dense variant.
        self.feed_forward_proj = feed_forward_proj
        self.is_gated_act = feed_forward_proj.split("-")[0] == "gated"

        self.use_cache = use_cache
        self.dense_act_fn = dense_act_fn

        # Aliases for code that reads the generic transformer attribute names.
        self.hidden_size = d_model
        self.num_attention_heads = num_heads
        self.num_hidden_layers = num_layers

        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano

Feature extractor class for Pop2Piano

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor

Bases: SequenceFeatureExtractor

Constructs a Pop2Piano feature extractor.

This feature extractor inherits from [~feature_extraction_sequence_utils.SequenceFeatureExtractor] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed to the RhythmExtractor2013 algorithm, which extracts the beat_times, beat positions and estimates their confidence as well as tempo in bpm; then beat_times is interpolated to get beatsteps. Later we calculate extrapolated_beatsteps from it to be used in the tokenizer. On the other hand, audio is resampled to self.sampling_rate and preprocessed, and then a log mel spectrogram is computed from that to be used in our transformer model.

PARAMETER DESCRIPTION
sampling_rate

Target Sampling rate of audio signal. It's the sampling rate that we forward to the model.

TYPE: `int`, *optional*, defaults to 22050 DEFAULT: 22050

padding_value

Padding value used to pad the audio. Should correspond to silences.

TYPE: `int`, *optional*, defaults to 0 DEFAULT: 0

window_size

Length of the window in samples to which the Fourier transform is applied.

TYPE: `int`, *optional*, defaults to 4096 DEFAULT: 4096

hop_length

Step size between each window of the waveform, in samples.

TYPE: `int`, *optional*, defaults to 1024 DEFAULT: 1024

min_frequency

Lowest frequency that will be used in the log-mel spectrogram.

TYPE: `float`, *optional*, defaults to 10.0 DEFAULT: 10.0

feature_size

The feature dimension of the extracted features.

TYPE: `int`, *optional*, defaults to 512 DEFAULT: 512

num_bars

Determines interval between each sequence.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Pop2Piano feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed
    to `RhythmExtractor2013` algorithm which extracts the beat_times, beat positions and estimates their confidence as
    well as tempo in bpm, then beat_times is interpolated to get beatsteps. Later we calculate
    extrapolated_beatsteps from it to be used in tokenizer. On the other hand audio is resampled to self.sampling_rate
    and preprocessed and then log mel spectrogram is computed from that to be used in our transformer model.

    Args:
        sampling_rate (`int`, *optional*, defaults to 22050):
            Target Sampling rate of audio signal. It's the sampling rate that we forward to the model.
        padding_value (`int`, *optional*, defaults to 0):
            Padding value used to pad the audio. Should correspond to silences.
        window_size (`int`, *optional*, defaults to 4096):
            Length of the window in samples to which the Fourier transform is applied.
        hop_length (`int`, *optional*, defaults to 1024):
            Step size between each window of the waveform, in samples.
        min_frequency (`float`, *optional*, defaults to 10.0):
            Lowest frequency that will be used in the log-mel spectrogram.
        feature_size (`int`, *optional*, defaults to 512):
            The feature dimension of the extracted features.
        num_bars (`int`, *optional*, defaults to 2):
            Determines interval between each sequence.
    """
    model_input_names = ["input_features", "beatsteps", "extrapolated_beatstep"]

    def __init__(
        self,
        sampling_rate: int = 22050,
        padding_value: int = 0,
        window_size: int = 4096,
        hop_length: int = 1024,
        min_frequency: float = 10.0,
        feature_size: int = 512,
        num_bars: int = 2,
        **kwargs,
    ):
        """
        Initializes a Pop2PianoFeatureExtractor object.

        Args:
            self: The Pop2PianoFeatureExtractor object itself.
            sampling_rate (int, optional): The sampling rate of the audio signal in Hz. Defaults to 22050.
            padding_value (int, optional): The value used for padding the audio signal. Defaults to 0.
            window_size (int, optional): The size of the analysis window in samples. Defaults to 4096.
            hop_length (int, optional): The number of samples between successive frames. Defaults to 1024.
            min_frequency (float, optional): The minimum frequency in Hz for the mel filters. Defaults to 10.0.
            feature_size (int, optional): The size of the output feature representation. Defaults to 512.
            num_bars (int, optional): The number of bars in each feature representation. Defaults to 2.
            **kwargs: Additional keyword arguments to be passed to the parent class constructor.

        Returns:
            None.

        Raises:
            None.

        """
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value
        self.window_size = window_size
        self.hop_length = hop_length
        self.min_frequency = min_frequency
        self.feature_size = feature_size
        self.num_bars = num_bars
        # Precomputed mel filter bank; Nyquist (sampling_rate // 2) is the upper band edge.
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=(self.window_size // 2) + 1,
            num_mel_filters=self.feature_size,
            min_frequency=self.min_frequency,
            max_frequency=float(self.sampling_rate // 2),
            sampling_rate=self.sampling_rate,
            norm=None,
            mel_scale="htk",
        )

    def mel_spectrogram(self, sequence: np.ndarray):
        """
        Generates MelSpectrogram.

        Args:
            sequence (`numpy.ndarray`):
                The sequence of which the mel-spectrogram will be computed.
        """
        mel_specs = []
        for seq in sequence:
            # Periodic Hann window: sample window_size + 1 points and drop the
            # duplicated endpoint.
            window = np.hanning(self.window_size + 1)[:-1]
            mel_specs.append(
                spectrogram(
                    waveform=seq,
                    window=window,
                    frame_length=self.window_size,
                    hop_length=self.hop_length,
                    power=2.0,
                    mel_filters=self.mel_filters,
                )
            )
        mel_specs = np.array(mel_specs)

        return mel_specs

    def extract_rhythm(self, audio: np.ndarray):
        """
        This algorithm(`RhythmExtractor2013`) extracts the beat positions and estimates their confidence as well as
        tempo in bpm for an audio signal. For more information please visit
        https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .

        Args:
            audio(`numpy.ndarray`):
                raw audio waveform which is passed to the Rhythm Extractor.

        Returns:
            tuple: `(bpm, beat_times, confidence, estimates, essentia_beat_intervals)` as produced by essentia.
        """
        requires_backends(self, ["essentia"])
        essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
        bpm, beat_times, confidence, estimates, essentia_beat_intervals = essentia_tracker(audio)

        return bpm, beat_times, confidence, estimates, essentia_beat_intervals

    def interpolate_beat_times(
        self, beat_times: np.ndarray, steps_per_beat: int, n_extend: int
    ):
        """
        This method takes beat_times and then interpolates that using `scipy.interpolate.interp1d` and the output is
        then used to convert raw audio to log-mel-spectrogram.

        Args:
            beat_times (`numpy.ndarray`):
                beat_times is passed into `scipy.interpolate.interp1d` for processing.
            steps_per_beat (`int`):
                used as an parameter to control the interpolation.
            n_extend (`int`):
                used as an parameter to control the interpolation.
        """
        requires_backends(self, ["scipy"])
        # fill_value="extrapolate" lets us query beyond the last known beat time.
        beat_times_function = scipy.interpolate.interp1d(
            np.arange(beat_times.size),
            beat_times,
            bounds_error=False,
            fill_value="extrapolate",
        )

        ext_beats = beat_times_function(
            np.linspace(0, beat_times.size + n_extend - 1, beat_times.size * steps_per_beat + n_extend)
        )

        return ext_beats

    def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray):
        """
        Preprocessing for log-mel-spectrogram

        Args:
            audio (`numpy.ndarray` of shape `(audio_length, )` ):
                Raw audio waveform to be processed.
            beatstep (`numpy.ndarray`):
                Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by
                the value at beatstep[0].
        """
        if audio is not None and len(audio.shape) != 1:
            raise ValueError(
                f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}."
            )
        # Shift beatsteps so the first beat is at time 0.
        if beatstep[0] > 0.0:
            beatstep = beatstep - beatstep[0]

        # 4 steps per bar — presumably assumes 4/4 time; TODO confirm.
        num_steps = self.num_bars * 4
        num_target_steps = len(beatstep)
        extrapolated_beatstep = self.interpolate_beat_times(
            beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1
        )

        # Convert beat positions to sample ranges, one range per num_steps-long chunk.
        sample_indices = []
        max_feature_length = 0
        for i in range(0, num_target_steps, num_steps):
            start_idx = i
            end_idx = min(i + num_steps, num_target_steps)
            start_sample = int(extrapolated_beatstep[start_idx] * self.sampling_rate)
            end_sample = int(extrapolated_beatstep[end_idx] * self.sampling_rate)
            sample_indices.append((start_sample, end_sample))
            max_feature_length = max(max_feature_length, end_sample - start_sample)
        # Right-pad every chunk with zeros to the longest chunk length.
        padded_batch = []
        for start_sample, end_sample in sample_indices:
            feature = audio[start_sample:end_sample]
            padded_feature = np.pad(
                feature,
                ((0, max_feature_length - feature.shape[0]),),
                "constant",
                constant_values=0,
            )
            padded_batch.append(padded_feature)

        padded_batch = np.asarray(padded_batch)
        return padded_batch, extrapolated_beatstep

    def _pad(self, features: np.ndarray, add_zero_line=True):
        """
        Method _pad in class Pop2PianoFeatureExtractor.

        This method pads the input features and attention masks to ensure that they have the same shape for further processing.

        Args:
            self (Pop2PianoFeatureExtractor): The instance of the Pop2PianoFeatureExtractor class.
            features (np.ndarray): An array containing the input features to be padded. The shape of each feature
                should be consistent within the array.
            add_zero_line (bool): A flag indicating whether to add a zero line at the end of each padded feature.
                Defaults to True.

        Returns:
            tuple:
                A tuple containing the padded features and attention masks.

                - The padded features are of type np.ndarray and have been concatenated along the 0th axis.
                - The attention masks are also of type np.ndarray and have been concatenated along the 0th axis.

        Raises:
            ValueError: If the input features are not of type np.ndarray or if the feature shapes are inconsistent.
            TypeError: If the input features or attention masks are not of type np.ndarray.
            ValueError: If the add_zero_line parameter is not a boolean value.
        """
        features_shapes = [each_feature.shape for each_feature in features]
        attention_masks, padded_features = [], []
        for i, each_feature in enumerate(features):
            # To pad "input_features" (3-D: pad along the time axis, dim 1).
            if len(each_feature.shape) == 3:
                features_pad_value = max([*zip(*features_shapes)][1]) - features_shapes[i][1] # pylint: disable=potential-index-error
                attention_mask = np.ones(features_shapes[i][:2], dtype=np.int64)
                feature_padding = ((0, 0), (0, features_pad_value), (0, 0))
                attention_mask_padding = (feature_padding[0], feature_padding[1])

            # To pad "beatsteps" and "extrapolated_beatstep" (1-D: pad along the only axis).
            else:
                each_feature = each_feature.reshape(1, -1)
                features_pad_value = max([*zip(*features_shapes)][0]) - features_shapes[i][0]
                attention_mask = np.ones(features_shapes[i], dtype=np.int64).reshape(1, -1)
                feature_padding = attention_mask_padding = ((0, 0), (0, features_pad_value))

            each_padded_feature = np.pad(each_feature, feature_padding, "constant", constant_values=self.padding_value)
            attention_mask = np.pad(
                attention_mask, attention_mask_padding, "constant", constant_values=self.padding_value
            )

            if add_zero_line:
                # if it is batched then we separate each example using a zero array
                zero_array_len = max([*zip(*features_shapes)][1]) # pylint: disable=potential-index-error

                # we concatenate the zero array line here
                each_padded_feature = np.concatenate(
                    [each_padded_feature, np.zeros([1, zero_array_len, self.feature_size])], axis=0
                )
                attention_mask = np.concatenate(
                    [attention_mask, np.zeros([1, zero_array_len], dtype=attention_mask.dtype)], axis=0
                )

            padded_features.append(each_padded_feature)
            attention_masks.append(attention_mask)

        padded_features = np.concatenate(padded_features, axis=0).astype(np.float32)
        attention_masks = np.concatenate(attention_masks, axis=0).astype(np.int64)

        return padded_features, attention_masks

    def pad(
        self,
        inputs: BatchFeature,
        is_batched: bool,
        return_attention_mask: bool,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ):
        """
        Pads the inputs to same length and returns attention_mask.

        Args:
            inputs (`BatchFeature`):
                Processed audio features.
            is_batched (`bool`):
                Whether inputs are batched or not.
            return_attention_mask (`bool`):
                Whether to return attention mask or not.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                - If set, will return tensors instead of list of python integers. Acceptable values are:

                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return Numpy `np.ndarray` objects.

                - If nothing is specified, it will return list of `np.ndarray` arrays.

        Returns:
            `BatchFeature` with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added
            to it:

            - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` -- Example:

                - 1, 1, 1, 0, 0 (audio 1, also here it is padded to max length of 5 that's why there are 2 zeros at
                the end indicating they are padded)
                - 0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)
                - 1, 1, 1, 1, 1 (audio 2)
                - 0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)
                - 1, 1, 1, 1, 1 (audio 3)

            - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
            - **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size, max_extrapolated_beatstep_seq_length)`
        """
        processed_features_dict = {}
        for feature_name, feature_value in inputs.items():
            if feature_name == "input_features":
                padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=True)
                processed_features_dict[feature_name] = padded_feature_values
                if return_attention_mask:
                    processed_features_dict["attention_mask"] = attention_mask
            else:
                padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=False)
                processed_features_dict[feature_name] = padded_feature_values
                if return_attention_mask:
                    processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask

        # If we are processing only one example, we should remove the zero array line since we don't need it to
        # separate examples from each other.
        if not is_batched and not return_attention_mask:
            processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...]

        outputs = BatchFeature(processed_features_dict, tensor_type=return_tensors)

        return outputs

    def __call__(
        self,
        audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        sampling_rate: Union[int, List[int]],
        steps_per_beat: int = 2,
        resample: Optional[bool] = True,
        return_attention_mask: Optional[bool] = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model.

        Args:
            audio (`np.ndarray`, `List`):
                The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a
                list of numpy arrays or a list of list of float values.
            sampling_rate (`int`):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            steps_per_beat (`int`, *optional*, defaults to 2):
                This is used in interpolating `beat_times`.
            resample (`bool`, *optional*, defaults to `True`):
                Determines whether to resample the audio to `sampling_rate` or not before processing. Must be True
                during inference.
            return_attention_mask (`bool` *optional*, defaults to `False`):
                Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as
                output or not. Automatically set to True for batched inputs.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):

                - If set, will return tensors instead of list of python integers. Acceptable values are:

                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return Numpy `np.ndarray` objects.

                - If nothing is specified, it will return list of `np.ndarray` arrays.
        """
        requires_backends(self, ["librosa"])
        is_batched = bool(isinstance(audio, (list, tuple)) and isinstance(audio[0], (np.ndarray, tuple, list)))
        if is_batched:
            # This enables the user to process files of different sampling_rate at same time
            if not isinstance(sampling_rate, list):
                raise ValueError(
                    "Please give sampling_rate of each audio separately when you are passing multiple raw_audios at the same time. "
                    f"Received {sampling_rate}, expected [audio_1_sr, ..., audio_n_sr]."
                )
            # NOTE(review): return_attention_mask defaults to False (not None), so
            # this `is None` branch never fires and batched inputs do NOT auto-enable
            # the attention mask as the docstring claims — confirm against upstream.
            return_attention_mask = True if return_attention_mask is None else return_attention_mask
        else:
            audio = [audio]
            sampling_rate = [sampling_rate]
            return_attention_mask = False if return_attention_mask is None else return_attention_mask

        batch_input_features, batch_beatsteps, batch_ext_beatstep = [], [], []
        for single_raw_audio, single_sampling_rate in zip(audio, sampling_rate):
            _, beat_times, _, _, _ = self.extract_rhythm(
                audio=single_raw_audio
            )
            beatsteps = self.interpolate_beat_times(beat_times=beat_times, steps_per_beat=steps_per_beat, n_extend=1)

            if self.sampling_rate != single_sampling_rate and self.sampling_rate is not None:
                if resample:
                    # Change sampling_rate to self.sampling_rate
                    single_raw_audio = librosa.core.resample(
                        single_raw_audio,
                        orig_sr=single_sampling_rate,
                        target_sr=self.sampling_rate,
                        res_type="kaiser_best",
                    )
                else:
                    warnings.warn(
                        f"The sampling_rate of the provided audio is different from the target sampling_rate "
                        f"of the Feature Extractor, {self.sampling_rate} vs {single_sampling_rate}. "
                        f"In these cases it is recommended to use `resample=True` in the `__call__` method to "
                        f"get the optimal behaviour."
                    )

            single_sampling_rate = self.sampling_rate
            # Trim the audio to the span between the first and last detected beats.
            start_sample = int(beatsteps[0] * single_sampling_rate)
            end_sample = int(beatsteps[-1] * single_sampling_rate)

            input_features, extrapolated_beatstep = self.preprocess_mel(
                single_raw_audio[start_sample:end_sample], beatsteps - beatsteps[0]
            )

            mel_specs = self.mel_spectrogram(input_features.astype(np.float32))

            # apply np.log to get log mel-spectrograms; clip avoids log(0)
            log_mel_specs = np.log(np.clip(mel_specs, a_min=1e-6, a_max=None))

            input_features = np.transpose(log_mel_specs, (0, -1, -2))

            batch_input_features.append(input_features)
            batch_beatsteps.append(beatsteps)
            batch_ext_beatstep.append(extrapolated_beatstep)

        output = BatchFeature(
            {
                "input_features": batch_input_features,
                "beatsteps": batch_beatsteps,
                "extrapolated_beatstep": batch_ext_beatstep,
            }
        )

        output = self.pad(
            output,
            is_batched=is_batched,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
        )

        return output

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.__call__(audio, sampling_rate, steps_per_beat=2, resample=True, return_attention_mask=False, return_tensors=None, **kwargs)

Main method to featurize and prepare for the model.

PARAMETER DESCRIPTION
audio

The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values.

TYPE: `np.ndarray`, `List`

sampling_rate

The sampling rate at which the audio input was sampled. It is strongly recommended to pass sampling_rate at the forward call to prevent silent errors.

TYPE: `int`

steps_per_beat

This is used in interpolating beat_times.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

resample

Determines whether to resample the audio to sampling_rate or not before processing. Must be True during inference.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

return_attention_mask

Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as output or not. Automatically set to True for batched inputs.

TYPE: `bool` *optional*, defaults to `False` DEFAULT: False

return_tensors
  • If set, will return tensors instead of list of python integers. Acceptable values are:

    • 'pt': Return PyTorch torch.Tensor objects.
    • 'np': Return Numpy np.ndarray objects.
  • If nothing is specified, it will return list of np.ndarray arrays.

TYPE: `str` or [`~utils.TensorType`], *optional* DEFAULT: None

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
def __call__(
    self,
    audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
    sampling_rate: Union[int, List[int]],
    steps_per_beat: int = 2,
    resample: Optional[bool] = True,
    return_attention_mask: Optional[bool] = False,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> BatchFeature:
    """
    Main method to featurize and prepare for the model.

    Args:
        audio (`np.ndarray`, `List`):
            The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a
            list of numpy arrays or a list of list of float values.
        sampling_rate (`int`):
            The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
            `sampling_rate` at the forward call to prevent silent errors.
        steps_per_beat (`int`, *optional*, defaults to 2):
            This is used in interpolating `beat_times`.
        resample (`bool`, *optional*, defaults to `True`):
            Determines whether to resample the audio to `sampling_rate` or not before processing. Must be True
            during inference.
        return_attention_mask (`bool` *optional*, defaults to `False`):
            Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as
            output or not. Automatically set to True for batched inputs.
        return_tensors (`str` or [`~utils.TensorType`], *optional*):

            - If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.

            - If nothing is specified, it will return list of `np.ndarray` arrays.
    """
    requires_backends(self, ["librosa"])
    is_batched = bool(isinstance(audio, (list, tuple)) and isinstance(audio[0], (np.ndarray, tuple, list)))
    if is_batched:
        # This enables the user to process files of different sampling_rate at same time
        if not isinstance(sampling_rate, list):
            raise ValueError(
                "Please give sampling_rate of each audio separately when you are passing multiple raw_audios at the same time. "
                f"Received {sampling_rate}, expected [audio_1_sr, ..., audio_n_sr]."
            )
        # NOTE(review): return_attention_mask defaults to False (not None), so this
        # `is None` branch never fires and batched inputs do NOT auto-enable the
        # attention mask as the docstring claims — confirm against upstream.
        return_attention_mask = True if return_attention_mask is None else return_attention_mask
    else:
        audio = [audio]
        sampling_rate = [sampling_rate]
        return_attention_mask = False if return_attention_mask is None else return_attention_mask

    batch_input_features, batch_beatsteps, batch_ext_beatstep = [], [], []
    for single_raw_audio, single_sampling_rate in zip(audio, sampling_rate):
        _, beat_times, _, _, _ = self.extract_rhythm(
            audio=single_raw_audio
        )
        beatsteps = self.interpolate_beat_times(beat_times=beat_times, steps_per_beat=steps_per_beat, n_extend=1)

        if self.sampling_rate != single_sampling_rate and self.sampling_rate is not None:
            if resample:
                # Change sampling_rate to self.sampling_rate
                single_raw_audio = librosa.core.resample(
                    single_raw_audio,
                    orig_sr=single_sampling_rate,
                    target_sr=self.sampling_rate,
                    res_type="kaiser_best",
                )
            else:
                warnings.warn(
                    f"The sampling_rate of the provided audio is different from the target sampling_rate "
                    f"of the Feature Extractor, {self.sampling_rate} vs {single_sampling_rate}. "
                    f"In these cases it is recommended to use `resample=True` in the `__call__` method to "
                    f"get the optimal behaviour."
                )

        single_sampling_rate = self.sampling_rate
        # Trim the audio to the span between the first and last detected beats.
        start_sample = int(beatsteps[0] * single_sampling_rate)
        end_sample = int(beatsteps[-1] * single_sampling_rate)

        input_features, extrapolated_beatstep = self.preprocess_mel(
            single_raw_audio[start_sample:end_sample], beatsteps - beatsteps[0]
        )

        mel_specs = self.mel_spectrogram(input_features.astype(np.float32))

        # apply np.log to get log mel-spectrograms; clip avoids log(0)
        log_mel_specs = np.log(np.clip(mel_specs, a_min=1e-6, a_max=None))

        input_features = np.transpose(log_mel_specs, (0, -1, -2))

        batch_input_features.append(input_features)
        batch_beatsteps.append(beatsteps)
        batch_ext_beatstep.append(extrapolated_beatstep)

    output = BatchFeature(
        {
            "input_features": batch_input_features,
            "beatsteps": batch_beatsteps,
            "extrapolated_beatstep": batch_ext_beatstep,
        }
    )

    output = self.pad(
        output,
        is_batched=is_batched,
        return_attention_mask=return_attention_mask,
        return_tensors=return_tensors,
    )

    return output

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.__init__(sampling_rate=22050, padding_value=0, window_size=4096, hop_length=1024, min_frequency=10.0, feature_size=512, num_bars=2, **kwargs)

Initializes a Pop2PianoFeatureExtractor object.

PARAMETER DESCRIPTION
self

The Pop2PianoFeatureExtractor object itself.

sampling_rate

The sampling rate of the audio signal in Hz. Defaults to 22050.

TYPE: int DEFAULT: 22050

padding_value

The value used for padding the audio signal. Defaults to 0.

TYPE: int DEFAULT: 0

window_size

The size of the analysis window in samples. Defaults to 4096.

TYPE: int DEFAULT: 4096

hop_length

The number of samples between successive frames. Defaults to 1024.

TYPE: int DEFAULT: 1024

min_frequency

The minimum frequency in Hz for the mel filters. Defaults to 10.0.

TYPE: float DEFAULT: 10.0

feature_size

The size of the output feature representation. Defaults to 512.

TYPE: int DEFAULT: 512

num_bars

The number of bars in each feature representation. Defaults to 2.

TYPE: int DEFAULT: 2

**kwargs

Additional keyword arguments to be passed to the parent class constructor.

DEFAULT: {}

RETURNS DESCRIPTION

None.

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def __init__(
    self,
    sampling_rate: int = 22050,
    padding_value: int = 0,
    window_size: int = 4096,
    hop_length: int = 1024,
    min_frequency: float = 10.0,
    feature_size: int = 512,
    num_bars: int = 2,
    **kwargs,
):
    """
    Initialize a Pop2Piano feature extractor.

    Args:
        self: The Pop2PianoFeatureExtractor instance being initialized.
        sampling_rate (int, optional): Audio sampling rate in Hz. Defaults to 22050.
        padding_value (int, optional): Value used to pad the audio. Defaults to 0.
        window_size (int, optional): Analysis window length in samples. Defaults to 4096.
        hop_length (int, optional): Number of samples between successive frames. Defaults to 1024.
        min_frequency (float, optional): Lowest frequency (Hz) used for the mel filters. Defaults to 10.0.
        feature_size (int, optional): Dimension of the extracted features. Defaults to 512.
        num_bars (int, optional): Number of bars covered by each feature sequence. Defaults to 2.
        **kwargs: Extra keyword arguments forwarded to the parent class constructor.

    Returns:
        None.

    Raises:
        None.
    """
    base_kwargs = {
        "feature_size": feature_size,
        "sampling_rate": sampling_rate,
        "padding_value": padding_value,
    }
    super().__init__(**base_kwargs, **kwargs)

    self.sampling_rate = sampling_rate
    self.padding_value = padding_value
    self.window_size = window_size
    self.hop_length = hop_length
    self.min_frequency = min_frequency
    self.feature_size = feature_size
    self.num_bars = num_bars

    # Mel filter bank spanning [min_frequency, Nyquist], HTK mel scale, unnormalized.
    nyquist = float(self.sampling_rate // 2)
    num_bins = (self.window_size // 2) + 1
    self.mel_filters = mel_filter_bank(
        num_frequency_bins=num_bins,
        num_mel_filters=self.feature_size,
        min_frequency=self.min_frequency,
        max_frequency=nyquist,
        sampling_rate=self.sampling_rate,
        norm=None,
        mel_scale="htk",
    )

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.extract_rhythm(audio)

This algorithm (RhythmExtractor2013) extracts the beat positions and estimates their confidence as well as the tempo in bpm for an audio signal. For more information please visit https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .

PARAMETER DESCRIPTION
audio(`numpy.ndarray`)

raw audio waveform which is passed to the Rhythm Extractor.

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def extract_rhythm(self, audio: np.ndarray):
    """
    Runs essentia's `RhythmExtractor2013` algorithm (multifeature method) on an
    audio signal, returning the estimated tempo in bpm, the beat positions,
    their confidence, per-frame bpm estimates and the beat intervals. For more
    information please visit
    https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .

    Args:
        audio(`numpy.ndarray`):
            raw audio waveform which is passed to the Rhythm Extractor.
    """
    requires_backends(self, ["essentia"])
    extractor = essentia.standard.RhythmExtractor2013(method="multifeature")
    tempo_bpm, beats, beats_confidence, bpm_estimates, beat_intervals = extractor(audio)
    return tempo_bpm, beats, beats_confidence, bpm_estimates, beat_intervals

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.interpolate_beat_times(beat_times, steps_per_beat, n_extend)

This method takes beat_times and then interpolates that using scipy.interpolate.interp1d and the output is then used to convert raw audio to log-mel-spectrogram.

PARAMETER DESCRIPTION
beat_times

beat_times is passed into scipy.interpolate.interp1d for processing.

TYPE: `numpy.ndarray`

steps_per_beat

used as a parameter to control the interpolation.

TYPE: `int`

n_extend

used as a parameter to control the interpolation.

TYPE: `int`

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def interpolate_beat_times(
    self, beat_times: np.ndarray, steps_per_beat: np.ndarray, n_extend: np.ndarray
):
    """
    Interpolates (and extrapolates) `beat_times` with
    `scipy.interpolate.interp1d`; the result is later used to convert raw audio
    into a log-mel-spectrogram.

    Args:
        beat_times (`numpy.ndarray`):
            beat_times is passed into `scipy.interpolate.interp1d` for processing.
        steps_per_beat (`int`):
            used as a parameter to control the interpolation.
        n_extend (`int`):
            used as a parameter to control the interpolation.
    """
    requires_backends(self, ["scipy"])
    # Linear interpolator over the original beat indices; "extrapolate" lets us
    # sample positions past the last known beat.
    interpolator = scipy.interpolate.interp1d(
        np.arange(beat_times.size),
        beat_times,
        bounds_error=False,
        fill_value="extrapolate",
    )

    num_samples = beat_times.size * steps_per_beat + n_extend
    sample_positions = np.linspace(0, beat_times.size + n_extend - 1, num_samples)
    return interpolator(sample_positions)

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.mel_spectrogram(sequence)

Generates MelSpectrogram.

PARAMETER DESCRIPTION
sequence

The sequence of which the mel-spectrogram will be computed.

TYPE: `numpy.ndarray`

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def mel_spectrogram(self, sequence: np.ndarray):
    """
    Generates a MelSpectrogram for every sequence in the batch.

    Args:
        sequence (`numpy.ndarray`):
            The (batched) sequence of which the mel-spectrograms will be computed.

    Returns:
        `numpy.ndarray`: stacked power (magnitude**2) mel spectrograms, one per
        input sequence.
    """
    # The Hann window only depends on `self.window_size`, so compute it once
    # instead of once per sequence (the original recomputed it in the loop).
    # `np.hanning(N + 1)[:-1]` yields the periodic form of the window.
    window = np.hanning(self.window_size + 1)[:-1]
    mel_specs = [
        spectrogram(
            waveform=seq,
            window=window,
            frame_length=self.window_size,
            hop_length=self.hop_length,
            power=2.0,  # power spectrogram (magnitude squared)
            mel_filters=self.mel_filters,
        )
        for seq in sequence
    ]
    return np.array(mel_specs)

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.pad(inputs, is_batched, return_attention_mask, return_tensors=None)

Pads the inputs to same length and returns attention_mask.

PARAMETER DESCRIPTION
inputs

Processed audio features.

TYPE: `BatchFeature`

is_batched

Whether inputs are batched or not.

TYPE: `bool`

return_attention_mask

Whether to return attention mask or not.

TYPE: `bool`

return_tensors
  • If set, will return tensors instead of list of python integers. Acceptable values are:

    • 'pt': Return PyTorch torch.Tensor objects.
    • 'np': Return Numpy np.ndarray objects.
  • If nothing is specified, it will return list of np.ndarray arrays.

TYPE: `str` or [`~utils.TensorType`], *optional* DEFAULT: None

RETURNS DESCRIPTION

BatchFeature with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added

to it:

  • attention_mask numpy.ndarray of shape (batch_size, max_input_features_seq_length) -- Example:

  • 1, 1, 1, 0, 0 (audio 1; it is padded to the max length of 5, which is why there are 2 zeros at the end indicating padding)

  • 0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)
  • 1, 1, 1, 1, 1 (audio 2)
  • 0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)
  • 1, 1, 1, 1, 1 (audio 3)
  • 1, 1, 1, 1, 1 (audio 3)
  • attention_mask_beatsteps numpy.ndarray of shape (batch_size, max_beatsteps_seq_length)
  • attention_mask_extrapolated_beatstep numpy.ndarray of shape (batch_size, max_extrapolated_beatstep_seq_length)
Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
def pad(
    self,
    inputs: BatchFeature,
    is_batched: bool,
    return_attention_mask: bool,
    return_tensors: Optional[Union[str, TensorType]] = None,
):
    """
    Pads every feature in `inputs` to a common length and optionally builds the
    corresponding attention masks.

    Args:
        inputs (`BatchFeature`):
            Processed audio features.
        is_batched (`bool`):
            Whether inputs are batched or not.
        return_attention_mask (`bool`):
            Whether to return attention mask or not.
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            - If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.

            - If nothing is specified, it will return list of `np.ndarray` arrays.

    Returns:
        `BatchFeature` with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added
        to it:

        - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` -- Example:

            - 1, 1, 1, 0, 0 (audio 1; padded to the max length of 5, hence the 2 trailing zeros marking padding)
            - 0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)
            - 1, 1, 1, 1, 1 (audio 2)
            - 0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)
            - 1, 1, 1, 1, 1 (audio 3)

        - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
        - **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size, max_extrapolated_beatstep_seq_length)`
    """
    padded_features = {}
    for name, value in inputs.items():
        # Only "input_features" gets the extra all-zero separator line, and its
        # mask goes under the plain "attention_mask" key; every other feature's
        # mask is stored under "attention_mask_<feature_name>".
        is_input_features = name == "input_features"
        padded_value, mask = self._pad(value, add_zero_line=is_input_features)
        padded_features[name] = padded_value
        if return_attention_mask:
            mask_key = "attention_mask" if is_input_features else f"attention_mask_{name}"
            padded_features[mask_key] = mask

    # For a single (unbatched) example with no mask requested, the trailing
    # zero separator line serves no purpose, so drop it.
    if not is_batched and not return_attention_mask:
        padded_features["input_features"] = padded_features["input_features"][:-1, ...]

    return BatchFeature(padded_features, tensor_type=return_tensors)

mindnlp.transformers.models.pop2piano.feature_extraction_pop2piano.Pop2PianoFeatureExtractor.preprocess_mel(audio, beatstep)

Preprocessing for log-mel-spectrogram

PARAMETER DESCRIPTION
audio

Raw audio waveform to be processed.

TYPE: `numpy.ndarray` of shape `(audio_length, )`

beatstep

Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by the value at beatstep[0].

TYPE: `numpy.ndarray`

Source code in mindnlp\transformers\models\pop2piano\feature_extraction_pop2piano.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray):
    """
    Preprocessing for log-mel-spectrogram: splits the waveform into
    beat-aligned chunks of `self.num_bars * 4` beat steps each and zero-pads
    them to a common length.

    Args:
        audio (`numpy.ndarray` of shape `(audio_length, )` ):
            Raw audio waveform to be processed.
        beatstep (`numpy.ndarray`):
            Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by
            the value at beatstep[0].
    """
    if audio is not None and len(audio.shape) != 1:
        raise ValueError(
            f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}."
        )
    # Normalize the beat grid so it starts at time 0.
    if beatstep[0] > 0.0:
        beatstep = beatstep - beatstep[0]

    steps_per_chunk = self.num_bars * 4
    total_steps = len(beatstep)
    extrapolated_beatstep = self.interpolate_beat_times(
        beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1
    )

    # First pass: map each chunk of beat steps to (start, end) sample indices
    # and track the longest chunk so all chunks can be padded to it.
    boundaries = []
    longest = 0
    for chunk_start in range(0, total_steps, steps_per_chunk):
        chunk_end = min(chunk_start + steps_per_chunk, total_steps)
        first_sample = int(extrapolated_beatstep[chunk_start] * self.sampling_rate)
        last_sample = int(extrapolated_beatstep[chunk_end] * self.sampling_rate)
        boundaries.append((first_sample, last_sample))
        longest = max(longest, last_sample - first_sample)

    # Second pass: slice each chunk and right-pad it with zeros. The pad width
    # is based on the actual slice length (the slice may be clipped when
    # `last_sample` runs past the end of `audio`).
    padded_chunks = []
    for first_sample, last_sample in boundaries:
        chunk = audio[first_sample:last_sample]
        padded_chunks.append(
            np.pad(
                chunk,
                ((0, longest - chunk.shape[0]),),
                "constant",
                constant_values=0,
            )
        )

    return np.asarray(padded_chunks), extrapolated_beatstep