encodec

mindnlp.transformers.models.encodec.modeling_encodec

EnCodec model (MindSpore port of the PyTorch implementation).

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConv1d

Bases: Module

Conv1d with asymmetric or causal padding and normalization.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 78-168)
class EncodecConv1d(nn.Module):
    """Conv1d with asymmetric or causal padding and normalization."""

    def __init__(
        self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1
    ):
        super().__init__()
        self.causal = config.use_causal_conv
        self.pad_mode = config.pad_mode
        self.norm_type = config.norm_type

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            logger.warning(
                "EncodecConv1d has been initialized with stride > 1 and dilation > 1"
                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
            )

        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
        if self.norm_type == "weight_norm":
            self.conv = nn.utils.weight_norm(self.conv)
        elif self.norm_type == "time_group_norm":
            self.norm = nn.GroupNorm(1, out_channels)

        kernel_size = self.conv.kernel_size[0]
        stride = mindspore.tensor(self.conv.stride[0], dtype=mindspore.int64)
        dilation = self.conv.dilation[0]

        # Effective kernel size with dilations.
        kernel_size = mindspore.tensor((kernel_size - 1) * dilation + 1, dtype=mindspore.int64)

        self.register_buffer("stride", stride, persistent=False)
        self.register_buffer("kernel_size", kernel_size, persistent=False)
        self.register_buffer("padding_total", mindspore.tensor(kernel_size - stride, dtype=mindspore.int64), persistent=False)

    def _get_extra_padding_for_conv1d(
        self,
        hidden_states: mindspore.Tensor,
    ) -> mindspore.Tensor:
        """See `pad_for_conv1d`."""
        length = hidden_states.shape[-1]
        n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
        n_frames = ops.ceil(n_frames).to(mindspore.int64) - 1
        ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total

        return ideal_length - length

    @staticmethod
    def _pad1d(hidden_states: mindspore.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
        """Tiny wrapper around nn.functional.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        """
        length = hidden_states.shape[-1]
        padding_left, padding_right = paddings
        if mode != 'reflect':
            return nn.functional.pad(hidden_states, paddings, mode, value)

        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
        padded = nn.functional.pad(hidden_states, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]

    def forward(self, hidden_states):
        extra_padding = self._get_extra_padding_for_conv1d(hidden_states).item()

        if self.causal:
            # Left padding for causal
            hidden_states = self._pad1d(hidden_states, (self.padding_total.item(), extra_padding), mode=self.pad_mode)
        else:
            # Asymmetric padding required for odd strides
            padding_right = self.padding_total.item() // 2
            padding_left = self.padding_total.item() - padding_right
            hidden_states = self._pad1d(
                hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
            )

        hidden_states = self.conv(hidden_states)

        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        return hidden_states
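
To make the padding arithmetic concrete, here is a minimal usage sketch; the layer sizes are illustrative assumptions, not values from a checkpoint:

```python
from mindspore import ops
from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig
from mindnlp.transformers.models.encodec.modeling_encodec import EncodecConv1d

# Hypothetical layer: kernel_size=7, stride=2 gives padding_total = 7 - 2 = 5.
config = EncodecConfig(use_causal_conv=True, pad_mode="reflect", norm_type="weight_norm")
conv = EncodecConv1d(config, in_channels=1, out_channels=8, kernel_size=7, stride=2)

x = ops.randn(1, 1, 100)  # (batch, channels, time)
y = conv(x)
# Causal mode puts all 5 padding samples on the left (plus any extra padding on
# the right so every input step falls in a frame): ceil((100 - 7 + 5) / 2 + 1) = 50.
print(y.shape)  # (1, 8, 50)
```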

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConvTranspose1d

Bases: Module

ConvTranspose1d with asymmetric or causal padding and normalization.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 171-220)
class EncodecConvTranspose1d(nn.Module):
    """ConvTranspose1d with asymmetric or causal padding and normalization."""

    def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1):
        super().__init__()
        self.causal = config.use_causal_conv
        self.trim_right_ratio = config.trim_right_ratio
        self.norm_type = config.norm_type
        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
        if config.norm_type == "weight_norm":
            self.conv = nn.utils.weight_norm(self.conv)
        elif config.norm_type == "time_group_norm":
            self.norm = nn.GroupNorm(1, out_channels)

        if not (self.causal or self.trim_right_ratio == 1.0):
            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")

    def forward(self, hidden_states):
        kernel_size = self.conv.kernel_size[0]
        stride = self.conv.stride[0]
        padding_total = kernel_size - stride

        hidden_states = self.conv(hidden_states)

        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
        # removed at the very end, when keeping only the right length for the output,
        # as removing it here would require also passing the length at the matching layer
        # in the encoder.
        if self.causal:
            # Trim the padding on the right according to the specified ratio
            # if trim_right_ratio = 1.0, trim everything from right
            padding_right = math.ceil(padding_total * self.trim_right_ratio)
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2

        padding_left = padding_total - padding_right

        # unpad
        end = hidden_states.shape[-1] - padding_right
        hidden_states = hidden_states[..., padding_left:end]
        return hidden_states
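
As a quick check of the unpadding arithmetic (a sketch with assumed values, not taken from a checkpoint): an upsampling layer built with kernel_size = 2 * ratio and stride = ratio leaves padding_total = ratio samples to trim.

```python
import math

kernel_size, stride = 16, 8           # e.g. a ratio-8 upsampling layer
padding_total = kernel_size - stride  # 8

# causal path with trim_right_ratio = 1.0: everything is trimmed on the right
padding_right = math.ceil(padding_total * 1.0)  # 8
padding_left = padding_total - padding_right    # 0

# non-causal path: asymmetric split for odd strides
padding_right = padding_total // 2              # 4
padding_left = padding_total - padding_right    # 4
```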

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoder

Bases: Module

SEANet decoder as used by EnCodec.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 303-334)
class EncodecDecoder(nn.Module):
    """SEANet decoder as used by EnCodec."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        scaling = int(2 ** len(config.upsampling_ratios))
        model = [EncodecConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]

        model += [EncodecLSTM(config, scaling * config.num_filters)]

        # Upsample to raw audio scale
        for ratio in config.upsampling_ratios:
            current_scale = scaling * config.num_filters
            # Add upsampling layers
            model += [nn.ELU()]
            model += [
                EncodecConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
            ]
            # Add residual layers
            for j in range(config.num_residual_layers):
                model += [EncodecResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
            scaling //= 2

        # Add final layers
        model += [nn.ELU()]
        model += [EncodecConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
        self.layers = nn.ModuleList(model)

    def forward(self, hidden_states):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states
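
With the default config values (num_filters=32, upsampling_ratios=[8, 5, 4, 2]), the decoder's channel schedule works out as sketched below; this is only the arithmetic, not model code:

```python
upsampling_ratios = [8, 5, 4, 2]  # EncodecConfig defaults
num_filters = 32

scaling = 2 ** len(upsampling_ratios)  # 16
channels = scaling * num_filters       # 512: width after the first decoder conv
for ratio in upsampling_ratios:
    print(f"upsample x{ratio}: {channels} -> {channels // 2} channels")
    channels //= 2
# 512 -> 256 -> 128 -> 64 -> 32; the final conv maps 32 -> audio_channels
```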

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoderOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_values

Decoded audio values, obtained using the decoder part of Encodec.

TYPE: `mindspore.Tensor` of shape `(batch_size, segment_length)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 67-75)
@dataclass
class EncodecDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`mindspore.Tensor`  of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """

    audio_values: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoder

Bases: Module

SEANet encoder as used by EnCodec.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 272-300)
class EncodecEncoder(nn.Module):
    """SEANet encoder as used by EnCodec."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        model = [EncodecConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
        scaling = 1

        # Downsample to raw audio scale
        for ratio in reversed(config.upsampling_ratios):
            current_scale = scaling * config.num_filters
            # Add residual layers
            for j in range(config.num_residual_layers):
                model += [EncodecResnetBlock(config, current_scale, [config.dilation_growth_rate**j, 1])]
            # Add downsampling layers
            model += [nn.ELU()]
            model += [EncodecConv1d(config, current_scale, current_scale * 2, kernel_size=ratio * 2, stride=ratio)]
            scaling *= 2

        model += [EncodecLSTM(config, scaling * config.num_filters)]
        model += [nn.ELU()]
        model += [EncodecConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size)]

        self.layers = nn.ModuleList(model)

    def forward(self, hidden_states):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states
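
The encoder's total hop length is the product of the ratios, which fixes the latent frame rate. A worked sketch with the default 24 kHz configuration:

```python
import math

upsampling_ratios = [8, 5, 4, 2]  # the encoder applies these in reverse
sampling_rate = 24_000

hop_length = math.prod(upsampling_ratios)  # 320 samples per latent frame
frame_rate = sampling_rate / hop_length    # 75 frames per second
print(hop_length, frame_rate)              # 320 75.0
```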

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoderOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional* DEFAULT: None

audio_scales

Scaling factor for each audio_codes input. This is used to unscale each chunk of audio when decoding.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 53-64)
@dataclass
class EncodecEncoderOutput(ModelOutput):
    """
    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    """

    audio_codes: mindspore.Tensor = None
    audio_scales: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook

Bases: Module

Codebook with Euclidean distance.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 337-370)
class EncodecEuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        embed = ops.zeros(config.codebook_size, config.codebook_dim)

        self.codebook_size = config.codebook_size

        self.register_buffer("inited", mindspore.Tensor([True]))
        self.register_buffer("cluster_size", ops.zeros(config.codebook_size))
        self.register_buffer("embed", embed)
        self.register_buffer("embed_avg", embed.copy())

    def quantize(self, hidden_states):
        embed = self.embed.t()
        scaled_states = ops.sum(hidden_states.pow(2), 1, keepdim=True)
        dist = -(scaled_states - 2 * hidden_states @ embed + ops.sum(embed.pow(2), 0, keepdim=True))
        embed_ind = ops.max(dist, dim=-1)[1]
        return embed_ind

    def encode(self, hidden_states):
        shape = hidden_states.shape
        # pre-process
        hidden_states = hidden_states.reshape((-1, shape[-1]))
        # quantize
        embed_ind = self.quantize(hidden_states)
        # post-process
        embed_ind = embed_ind.view(*shape[:-1])
        return embed_ind

    def decode(self, embed_ind):
        quantize = nn.functional.embedding(embed_ind, self.embed)
        return quantize
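
quantize avoids forming explicit pairwise differences by expanding ||x - e||^2 = ||x||^2 - 2 x·e + ||e||^2, negating it, and taking the argmax over codebook entries. A standalone sketch of the same trick (the shapes are assumptions for illustration):

```python
from mindspore import ops

hidden_states = ops.randn(5, 128)  # 5 vectors to quantize
embed = ops.randn(1024, 128)       # codebook: 1024 entries of dimension 128

x_sq = ops.sum(hidden_states.pow(2), 1, keepdim=True)  # (5, 1)
e_sq = ops.sum(embed.t().pow(2), 0, keepdim=True)      # (1, 1024)
dist = -(x_sq - 2 * hidden_states @ embed.t() + e_sq)  # negative squared distances
codes = dist.argmax(-1)                                # index of the nearest entry
```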

mindnlp.transformers.models.encodec.modeling_encodec.EncodecLSTM

Bases: Module

LSTM layer that manages its hidden state and data layout internally. Expects input in convolutional layout (batch, channels, time).

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 223-236)
class EncodecLSTM(nn.Module):
    """
    LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
    """

    def __init__(self, config, dimension):
        super().__init__()
        self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)

    def forward(self, hidden_states):
        # (batch, channels, time) -> (time, batch, channels): the LSTM is sequence-first
        hidden_states = hidden_states.permute(2, 0, 1)
        # residual connection around the LSTM
        hidden_states = self.lstm(hidden_states)[0] + hidden_states
        # back to convolutional layout
        hidden_states = hidden_states.permute(1, 2, 0)
        return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel

Bases: EncodecPreTrainedModel

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 473-746)
class EncodecModel(EncodecPreTrainedModel):
    def __init__(self, config: EncodecConfig):
        super().__init__(config)
        self.config = config

        self.encoder = EncodecEncoder(config)
        self.decoder = EncodecDecoder(config)

        self.quantizer = EncodecResidualVectorQuantizer(config)

        self.bits_per_codebook = int(math.log2(self.config.codebook_size))
        if 2**self.bits_per_codebook != self.config.codebook_size:
            raise ValueError("The codebook_size must be a power of 2.")

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _encode_frame(
        self, input_values: mindspore.Tensor, bandwidth: float, padding_mask: int
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor]]:
        """
        Encodes the given input using the underlying VQVAE. If `config.normalize` is set to `True` the input is first
        normalized. The padding mask is required to compute the correct scale.
        """
        length = input_values.shape[-1]
        duration = length / self.config.sampling_rate

        if self.config.chunk_length_s is not None and duration > 1e-5 + self.config.chunk_length_s:
            raise RuntimeError(f"Duration of frame ({duration}) is longer than chunk {self.config.chunk_length_s}")

        scale = None
        if self.config.normalize:
            # zero out any padded values before computing the scale
            input_values = input_values * padding_mask
            mono = ops.sum(input_values, 1, keepdim=True) / input_values.shape[1]
            scale = ops.mean(mono.pow(2), dim=-1, keepdim=True).sqrt() + 1e-8
            input_values = input_values / scale

        embeddings = self.encoder(input_values)
        codes = self.quantizer.encode(embeddings, bandwidth)
        codes = ops.transpose(codes, 0, 1)
        return codes, scale

    def encode(
        self,
        input_values: mindspore.Tensor,
        padding_mask: mindspore.Tensor = None,
        bandwidth: Optional[float] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, Optional[mindspore.Tensor]], EncodecEncoderOutput]:
        """
        Encodes the input audio waveform into discrete codes.

        Args:
            input_values (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform.
            padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            bandwidth (`float`, *optional*):
                The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
                bandwidth. Bandwidth is expressed in kbps (thousands of bits per second), e.g. a 6 kbps target is
                passed as `bandwidth == 6.0`.

        Returns:
            A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
            factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
            `codebook` of shape `[batch_size, num_codebooks, frames]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if bandwidth is None:
            bandwidth = self.config.target_bandwidths[0]
        if bandwidth not in self.config.target_bandwidths:
            raise ValueError(
                f"This model doesn't support the bandwidth {bandwidth}. "
                f"Select one of {self.config.target_bandwidths}."
            )

        _, channels, input_length = input_values.shape

        if channels < 1 or channels > 2:
            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

        chunk_length = self.config.chunk_length
        if chunk_length is None:
            chunk_length = input_length
            stride = input_length
        else:
            stride = self.config.chunk_stride

        if padding_mask is None:
            padding_mask = ops.ones_like(input_values).bool()

        encoded_frames = []
        scales = []

        step = chunk_length - stride
        if (input_length % stride) - step != 0:
            raise ValueError(
                "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
            )

        for offset in range(0, input_length - step, stride):
            mask = padding_mask[..., offset : offset + chunk_length].bool()
            frame = input_values[:, :, offset : offset + chunk_length]
            encoded_frame, scale = self._encode_frame(frame, bandwidth, mask)
            encoded_frames.append(encoded_frame)
            scales.append(scale)

        encoded_frames = ops.stack(encoded_frames)

        if not return_dict:
            return (encoded_frames, scales)

        return EncodecEncoderOutput(encoded_frames, scales)

    @staticmethod
    def _linear_overlap_add(frames: List[mindspore.Tensor], stride: int):
        # Generic overlap add, with linear fade-in/fade-out, supporting complex scenario
        # e.g., more than 2 frames per position.
        # The core idea is to use a weight function that is a triangle,
        # with a maximum value at the middle of the chunk.
        # We use this weighting when summing the frames, and divide by the sum of weights
        # for each positions at the end. Thus:
        #   - if a frame is the only one to cover a position, the weighting is a no-op.
        #   - if 2 frames cover a position:
        #          ...  ...
        #         /   \/   \
        #        /    /\    \
        #            S  T       , i.e. S offset of second frame starts, T end of first frame.
        # Then the weight function for each one is: (t - S), (T - t), with `t` a given offset.
        # After the final normalization, the weight of the second frame at position `t` is
        # (t - S) / (t - S + (T - t)) = (t - S) / (T - S), which is exactly what we want.
        #
        #   - if more than 2 frames overlap at a given point, we hope that by induction
        #      something sensible happens.
        if len(frames) == 0:
            raise ValueError("`frames` cannot be an empty list.")

        dtype = frames[0].dtype
        shape = frames[0].shape[:-1]
        total_size = stride * (len(frames) - 1) + frames[-1].shape[-1]

        frame_length = frames[0].shape[-1]
        time_vec = ops.linspace(0, 1, frame_length + 2, dtype=dtype)[1:-1]
        weight = 0.5 - (time_vec - 0.5).abs()

        sum_weight = ops.zeros(total_size, dtype=dtype)
        out = ops.zeros(*shape, total_size, dtype=dtype)
        offset: int = 0

        for frame in frames:
            frame_length = frame.shape[-1]
            out[..., offset : offset + frame_length] += weight[:frame_length] * frame
            sum_weight[offset : offset + frame_length] += weight[:frame_length]
            offset += stride

        if sum_weight.min() == 0:
            raise ValueError(f"`sum_weight` minimum element must be greater than zero: {sum_weight}")

        return out / sum_weight

    def _decode_frame(self, codes: mindspore.Tensor, scale: Optional[mindspore.Tensor] = None) -> mindspore.Tensor:
        codes = ops.transpose(codes, 0, 1)
        embeddings = self.quantizer.decode(codes)
        outputs = self.decoder(embeddings)
        if scale is not None:
            outputs = outputs * scale.view(-1, 1, 1)
        return outputs

    def decode(
        self,
        audio_codes: mindspore.Tensor,
        audio_scales: mindspore.Tensor,
        padding_mask: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecDecoderOutput]:
        """
        Decodes the given frames into an output audio waveform.

        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
        trimmed.

        Args:
            audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
                Discrete code embeddings computed using `model.encode`.
            audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
                Scaling factor for each `audio_codes` input.
            padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        chunk_length = self.config.chunk_length
        if chunk_length is None:
            if len(audio_codes) != 1:
                raise ValueError(f"Expected one frame, got {len(audio_codes)}")
            audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
        else:
            decoded_frames = []

            for frame, scale in zip(audio_codes, audio_scales):
                frames = self._decode_frame(frame, scale)
                decoded_frames.append(frames)

            audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

        # truncate based on padding mask
        if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
            audio_values = audio_values[..., : padding_mask.shape[-1]]

        if not return_dict:
            return (audio_values,)
        return EncodecDecoderOutput(audio_values)

    def forward(
        self,
        input_values: mindspore.Tensor,
        padding_mask: Optional[mindspore.Tensor] = None,
        bandwidth: Optional[float] = None,
        audio_codes: Optional[mindspore.Tensor] = None,
        audio_scales: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from mindnlp.transformers import AutoProcessor, EncodecModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model_id = "facebook/encodec_24khz"
        >>> model = EncodecModel.from_pretrained(model_id)
        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> inputs = processor(raw_audio=audio_sample, return_tensors="ms")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if padding_mask is None:
            padding_mask = ops.ones_like(input_values).bool()

        if audio_codes is not None and audio_scales is None:
            raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")

        if audio_scales is not None and audio_codes is None:
            raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")

        if audio_scales is None and audio_codes is None:
            audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)

        audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
        if not return_dict:
            return (audio_codes, audio_values)

        return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)
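
The triangular weighting in _linear_overlap_add can be sanity-checked numerically. A small sketch with two overlapping constant frames (values chosen purely for illustration):

```python
from mindspore import ops
from mindnlp.transformers.models.encodec.modeling_encodec import EncodecModel

# two frames of length 6 overlapping by 3 samples (stride = 3)
frames = [ops.ones((1, 6)), ops.ones((1, 6)) * 3.0]
out = EncodecModel._linear_overlap_add(frames, 3)

print(out.shape)  # (1, 9)
# Outside the overlap each frame passes through unchanged (the normalization
# makes the weighting a no-op); inside it, the output fades linearly from 1 to 3.
```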

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.decode(audio_codes, audio_scales, padding_mask=None, return_dict=None)

Decodes the given frames into an output audio waveform.

Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be trimmed.

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*

audio_scales

Scaling factor for each audio_codes input.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*

padding_mask

Padding mask used to pad the input_values.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)` DEFAULT: None

return_dict

Whether or not to return a [~utils.ModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 649-695)
def decode(
    self,
    audio_codes: mindspore.Tensor,
    audio_scales: mindspore.Tensor,
    padding_mask: Optional[mindspore.Tensor] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecDecoderOutput]:
    """
    Decodes the given frames into an output audio waveform.

    Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
    trimmed.

    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input.
        padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Padding mask used to pad the `input_values`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

    """
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    chunk_length = self.config.chunk_length
    if chunk_length is None:
        if len(audio_codes) != 1:
            raise ValueError(f"Expected one frame, got {len(audio_codes)}")
        audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
    else:
        decoded_frames = []

        for frame, scale in zip(audio_codes, audio_scales):
            frames = self._decode_frame(frame, scale)
            decoded_frames.append(frames)

        audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

    # truncate based on padding mask
    if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
        audio_values = audio_values[..., : padding_mask.shape[-1]]

    if not return_dict:
        return (audio_values,)
    return EncodecDecoderOutput(audio_values)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.encode(input_values, padding_mask=None, bandwidth=None, return_dict=None)

Encodes the input audio waveform into discrete codes.

PARAMETER DESCRIPTION
input_values

Float values of the input audio waveform.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`

padding_mask

Padding mask used to pad the input_values.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)` DEFAULT: None

bandwidth

The target bandwidth. Must be one of config.target_bandwidths. If None, the smallest possible bandwidth is used. Bandwidth is expressed in kbps (thousands of bits per second), e.g. a 6 kbps target is passed as bandwidth == 6.0.

TYPE: `float`, *optional* DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple[Tensor, Optional[Tensor]], EncodecEncoderOutput]

A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling factors for each chunk when normalize is True. Each frame is a tuple (codebook, scale), with codebook of shape [batch_size, num_codebooks, frames].

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 522-593)
def encode(
    self,
    input_values: mindspore.Tensor,
    padding_mask: mindspore.Tensor = None,
    bandwidth: Optional[float] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, Optional[mindspore.Tensor]], EncodecEncoderOutput]:
    """
    Encodes the input audio waveform into discrete codes.

    Args:
        input_values (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Float values of the input audio waveform.
        padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Padding mask used to pad the `input_values`.
        bandwidth (`float`, *optional*):
            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
            bandwidth. Bandwidth is expressed in kbps (thousands of bits per second), e.g. a 6 kbps target is
            passed as `bandwidth == 6.0`.

    Returns:
        A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
        factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
        `codebook` of shape `[batch_size, num_codebooks, frames]`.
    """
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    if bandwidth is None:
        bandwidth = self.config.target_bandwidths[0]
    if bandwidth not in self.config.target_bandwidths:
        raise ValueError(
            f"This model doesn't support the bandwidth {bandwidth}. "
            f"Select one of {self.config.target_bandwidths}."
        )

    _, channels, input_length = input_values.shape

    if channels < 1 or channels > 2:
        raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

    chunk_length = self.config.chunk_length
    if chunk_length is None:
        chunk_length = input_length
        stride = input_length
    else:
        stride = self.config.chunk_stride

    if padding_mask is None:
        padding_mask = ops.ones_like(input_values).bool()

    encoded_frames = []
    scales = []

    step = chunk_length - stride
    if (input_length % stride) - step != 0:
        raise ValueError(
            "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
        )

    for offset in range(0, input_length - step, stride):
        mask = padding_mask[..., offset : offset + chunk_length].bool()
        frame = input_values[:, :, offset : offset + chunk_length]
        encoded_frame, scale = self._encode_frame(frame, bandwidth, mask)
        encoded_frames.append(encoded_frame)
        scales.append(scale)

    encoded_frames = ops.stack(encoded_frames)

    if not return_dict:
        return (encoded_frames, scales)

    return EncodecEncoderOutput(encoded_frames, scales)
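
A round-trip sketch building on the forward example below; it assumes the same model, processor, and inputs, and that the requested bandwidth is one of config.target_bandwidths:

```python
>>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"], bandwidth=6.0)
>>> audio_values = model.decode(
...     encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"]
... )[0]
```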

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.forward(input_values, padding_mask=None, bandwidth=None, audio_codes=None, audio_scales=None, return_dict=None)

Examples:

```python
>>> from datasets import load_dataset
>>> from mindnlp.transformers import AutoProcessor, EncodecModel

>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]

>>> model_id = "facebook/encodec_24khz"
>>> model = EncodecModel.from_pretrained(model_id)
>>> processor = AutoProcessor.from_pretrained(model_id)

>>> inputs = processor(raw_audio=audio_sample, return_tensors="ms")

>>> outputs = model(**inputs)
>>> audio_codes = outputs.audio_codes
>>> audio_values = outputs.audio_values
```
Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 697-746)
def forward(
    self,
    input_values: mindspore.Tensor,
    padding_mask: Optional[mindspore.Tensor] = None,
    bandwidth: Optional[float] = None,
    audio_codes: Optional[mindspore.Tensor] = None,
    audio_scales: Optional[mindspore.Tensor] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]:
    r"""
    Returns:

    Examples:

    ```python
    >>> from datasets import load_dataset
    >>> from mindnlp.transformers import AutoProcessor, EncodecModel

    >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
    >>> audio_sample = dataset["train"]["audio"][0]["array"]

    >>> model_id = "facebook/encodec_24khz"
    >>> model = EncodecModel.from_pretrained(model_id)
    >>> processor = AutoProcessor.from_pretrained(model_id)

    >>> inputs = processor(raw_audio=audio_sample, return_tensors="ms")

    >>> outputs = model(**inputs)
    >>> audio_codes = outputs.audio_codes
    >>> audio_values = outputs.audio_values
    ```"""
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    if padding_mask is None:
        padding_mask = ops.ones_like(input_values).bool()

    if audio_codes is not None and audio_scales is None:
        raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")

    if audio_scales is not None and audio_codes is None:
        raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")

    if audio_scales is None and audio_codes is None:
        audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)

    audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
    if not return_dict:
        return (audio_codes, audio_values)

    return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional* DEFAULT: None

audio_values

Decoded audio values, obtained using the decoder part of Encodec.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 39-50)
@dataclass
class EncodecOutput(ModelOutput):
    """
    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """

    audio_codes: mindspore.Tensor = None
    audio_values: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 437-470)
class EncodecPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = EncodecConfig
    base_model_prefix = "encodec"
    main_input_name = "input_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.constant_(param, 0.0)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer

Bases: Module

Residual Vector Quantizer.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 393-434)
class EncodecResidualVectorQuantizer(nn.Module):
    """Residual Vector Quantizer."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        self.codebook_size = config.codebook_size
        self.frame_rate = config.frame_rate
        self.num_quantizers = config.num_quantizers
        self.layers = nn.ModuleList([EncodecVectorQuantization(config) for _ in range(config.num_quantizers)])

    def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
        """Return num_quantizers based on specified target bandwidth."""
        bw_per_q = math.log2(self.codebook_size) * self.frame_rate
        num_quantizers = self.num_quantizers
        if bandwidth is not None and bandwidth > 0.0:
            num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
        return num_quantizers

    def encode(self, embeddings: mindspore.Tensor, bandwidth: Optional[float] = None) -> mindspore.Tensor:
        """
        Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
        the appropriate number of quantizers to use and returns indices for each quantizer.
        """
        num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
        residual = embeddings
        all_indices = []
        for layer in self.layers[:num_quantizers]:
            indices = layer.encode(residual)
            quantized = layer.decode(indices)
            residual = residual - quantized
            all_indices.append(indices)
        out_indices = ops.stack(all_indices)
        return out_indices

    def decode(self, codes: mindspore.Tensor) -> mindspore.Tensor:
        """Decode the given codes to the quantized representation."""
        quantized_out = mindspore.tensor(0.0)
        for i, indices in enumerate(codes):
            layer = self.layers[i]
            quantized = layer.decode(indices)
            quantized_out = quantized_out.to(quantized.dtype) + quantized
        return quantized_out
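
The residual idea is easy to see in isolation: each stage quantizes only the error the previous stages left behind, so the reconstruction refines as codes accumulate. A toy sketch in plain Python, with scalar "codebooks" standing in for the learned ones:

```python
def quantize_stage(residual, step):
    code = round(residual / step)  # index this stage would emit
    return code, code * step       # (index, reconstructed value)

residual = 0.7134
codes = []
for step in (0.1, 0.01, 0.001):    # three RVQ stages, coarse to fine
    code, recon = quantize_stage(residual, step)
    codes.append(code)
    residual -= recon              # the next stage sees only the error
print(codes)  # [7, 1, 3]; the summed reconstruction 0.713 approaches the input
```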

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.decode(codes)

Decode the given codes to the quantized representation.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 427-434)
def decode(self, codes: mindspore.Tensor) -> mindspore.Tensor:
    """Decode the given codes to the quantized representation."""
    quantized_out = mindspore.tensor(0.0)
    for i, indices in enumerate(codes):
        layer = self.layers[i]
        quantized = layer.decode(indices)
        quantized_out = quantized_out.to(quantized.dtype) + quantized
    return quantized_out

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.encode(embeddings, bandwidth=None)

Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets the appropriate number of quantizers to use and returns indices for each quantizer.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 411-425)
def encode(self, embeddings: mindspore.Tensor, bandwidth: Optional[float] = None) -> mindspore.Tensor:
    """
    Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
    the appropriate number of quantizers to use and returns indices for each quantizer.
    """
    num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
    residual = embeddings
    all_indices = []
    for layer in self.layers[:num_quantizers]:
        indices = layer.encode(residual)
        quantized = layer.decode(indices)
        residual = residual - quantized
        all_indices.append(indices)
    out_indices = ops.stack(all_indices)
    return out_indices

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.get_num_quantizers_for_bandwidth(bandwidth=None)

Return num_quantizers based on specified target bandwidth.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 403-409)
def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
    """Return num_quantizers based on specified target bandwidth."""
    bw_per_q = math.log2(self.codebook_size) * self.frame_rate
    num_quantizers = self.num_quantizers
    if bandwidth is not None and bandwidth > 0.0:
        num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
    return num_quantizers
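
With the default 24 kHz setup (codebook_size=1024, i.e. 10 bits per code, and frame_rate=75), each quantizer costs 750 bits per second. A worked sketch of the resulting quantizer counts:

```python
import math

codebook_size, frame_rate = 1024, 75
bw_per_q = math.log2(codebook_size) * frame_rate  # 10 * 75 = 750 bps per quantizer
for bandwidth in (1.5, 3.0, 6.0, 12.0, 24.0):     # kbps
    print(bandwidth, int(max(1, math.floor(bandwidth * 1000 / bw_per_q))))
# 1.5 -> 2, 3.0 -> 4, 6.0 -> 8, 12.0 -> 16, 24.0 -> 32 quantizers
```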

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResnetBlock

Bases: Module

Residual block from SEANet model as used by EnCodec.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 239-269)
class EncodecResnetBlock(nn.Module):
    """
    Residual block from SEANet model as used by EnCodec.
    """

    def __init__(self, config: EncodecConfig, dim: int, dilations: List[int]):
        super().__init__()
        kernel_sizes = (config.residual_kernel_size, 1)
        if len(kernel_sizes) != len(dilations):
            raise ValueError("Number of kernel sizes should match number of dilations")

        hidden = dim // config.compress
        block = []
        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
            in_chs = dim if i == 0 else hidden
            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
            block += [nn.ELU()]
            block += [EncodecConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation)]
        self.block = nn.ModuleList(block)

        if config.use_conv_shortcut:
            self.shortcut = EncodecConv1d(config, dim, dim, kernel_size=1)
        else:
            self.shortcut = nn.Identity()

    def forward(self, hidden_states):
        residual = hidden_states
        for layer in self.block:
            hidden_states = layer(hidden_states)

        return self.shortcut(residual) + hidden_states
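
The block is a two-convolution bottleneck around a residual connection; with compress=2 the hidden width is half of dim. A shape sketch with assumed values:

```python
dim, compress = 64, 2
hidden = dim // compress  # 32

# conv 1: dim -> hidden channels, kernel_size = residual_kernel_size, dilated
# conv 2: hidden -> dim channels, kernel_size = 1
# output = shortcut(input) + block(input); shapes stay (batch, dim, time)
print(dim, hidden)  # 64 32
```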

mindnlp.transformers.models.encodec.modeling_encodec.EncodecVectorQuantization

Bases: Module

Vector quantization implementation. Currently supports only euclidean distance.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 373-390)
class EncodecVectorQuantization(nn.Module):
    """
    Vector quantization implementation. Currently supports only euclidean distance.
    """

    def __init__(self, config: EncodecConfig):
        super().__init__()
        self.codebook = EncodecEuclideanCodebook(config)

    def encode(self, hidden_states):
        hidden_states = hidden_states.permute(0, 2, 1)
        embed_in = self.codebook.encode(hidden_states)
        return embed_in

    def decode(self, embed_ind):
        quantize = self.codebook.decode(embed_ind)
        quantize = quantize.permute(0, 2, 1)
        return quantize

mindnlp.transformers.models.encodec.configuration_encodec

Encodec Model config

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of an [EncodecModel]. It is used to instantiate an Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the facebook/encodec_24khz architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
target_bandwidths

The range of different bandwidths the model can encode audio with.

TYPE: `List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]` DEFAULT: [1.5, 3.0, 6.0, 12.0, 24.0]

sampling_rate

The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 24000 DEFAULT: 24000

audio_channels

Number of channels in the audio data. Either 1 for mono or 2 for stereo.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

normalize

Whether the audio shall be normalized when passed.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

chunk_length_s

If defined, the audio is pre-processed into chunks of length chunk_length_s and then encoded.

TYPE: `float`, *optional* DEFAULT: None

overlap

Defines the overlap between consecutive chunks. It is used to compute the chunk_stride via the formula int((1.0 - self.overlap) * self.chunk_length); see the sketch after this parameter list.

TYPE: `float`, *optional* DEFAULT: None

hidden_size

Intermediate representation dimension.

TYPE: `int`, *optional*, defaults to 128 DEFAULT: 128

num_filters

Number of convolution kernels of first EncodecConv1d down sampling layer.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

num_residual_layers

Number of residual layers.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

upsampling_ratios

Kernel size and stride ratios. The encoder uses downsampling ratios rather than upsampling ratios, so it applies these ratios in reverse order; the order given here must match the decoder order.

TYPE: `Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]` DEFAULT: [8, 5, 4, 2]

norm_type

Normalization method. Must be one of ["weight_norm", "time_group_norm"].

TYPE: `str`, *optional*, defaults to `"weight_norm"` DEFAULT: 'weight_norm'

kernel_size

Kernel size for the initial convolution.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

last_kernel_size

Kernel size for the last convolution layer.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

residual_kernel_size

Kernel size for the residual layers.

TYPE: `int`, *optional*, defaults to 3 DEFAULT: 3

dilation_growth_rate

How much to increase the dilation with each layer.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

use_causal_conv

Whether to use fully causal convolution.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

pad_mode

Padding mode for the convolutions.

TYPE: `str`, *optional*, defaults to `"reflect"` DEFAULT: 'reflect'

compress

Reduced dimensionality in residual branches (from Demucs v3).

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

num_lstm_layers

Number of LSTM layers at the end of the encoder.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

trim_right_ratio

Ratio for trimming at the right of the transposed convolution under the use_causal_conv = True setup. If equal to 1.0, it means that all the trimming is done at the right.

TYPE: `float`, *optional*, defaults to 1.0 DEFAULT: 1.0

codebook_size

Number of discrete codes that make up the VQVAE.

TYPE: `int`, *optional*, defaults to 1024 DEFAULT: 1024

codebook_dim

Dimension of the codebook vectors. If not defined, uses hidden_size.

TYPE: `int`, *optional* DEFAULT: None

use_conv_shortcut

Whether to use a convolutional layer as the 'skip' connection in the EncodecResnetBlock block. If False, an identity function will be used, giving a generic residual connection.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True
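
To make the chunking parameters concrete, here is a sketch of how chunk_length and chunk_stride are derived; the values are illustrative (loosely modeled on the 48 kHz checkpoint), not defaults of this config:

```python
sampling_rate, chunk_length_s, overlap = 48_000, 1.0, 0.01

chunk_length = int(chunk_length_s * sampling_rate)          # 48000 samples
chunk_stride = max(1, int((1.0 - overlap) * chunk_length))  # 47520 samples
```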

Example
```python
>>> from mindnlp.transformers import EncodecModel, EncodecConfig
...
>>> # Initializing a "facebook/encodec_24khz" style configuration
>>> configuration = EncodecConfig()
...
>>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
>>> model = EncodecModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
```
Source code in mindnlp\transformers\models\encodec\configuration_encodec.py (lines 34-239)
class EncodecConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EncodecModel`]. It is used to instantiate an
    Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://hf-mirror.com/facebook/encodec_24khz) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        target_bandwidths (`List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]`):
            The range of different bandwidths the model can encode audio with.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        audio_channels (`int`, *optional*, defaults to 1):
            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether the audio shall be normalized when passed.
        chunk_length_s (`float`, *optional*):
            If defined, the audio is pre-processed into chunks of length `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between consecutive chunks. It is used to compute the `chunk_stride` via the
            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
        hidden_size (`int`, *optional*, defaults to 128):
            Intermediate representation dimension.
        num_filters (`int`, *optional*, defaults to 32):
            Number of convolution kernels of first `EncodecConv1d` down sampling layer.
        num_residual_layers (`int`,  *optional*, defaults to 1):
            Number of residual layers.
        upsampling_ratios (`Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]`):
            Kernel size and stride ratios. The encoder uses downsampling ratios rather than upsampling ratios, so it
            applies these ratios in reverse order; the order given here must match the decoder order.
        norm_type (`str`, *optional*, defaults to `"weight_norm"`):
            Normalization method. Must be one of `["weight_norm", "time_group_norm"]`.
        kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for the initial convolution.
        last_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for the last convolution layer.
        residual_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the residual layers.
        dilation_growth_rate (`int`, *optional*, defaults to 2):
            How much to increase the dilation with each layer.
        use_causal_conv (`bool`, *optional*, defaults to `True`):
            Whether to use fully causal convolution.
        pad_mode (`str`, *optional*, defaults to `"reflect"`):
            Padding mode for the convolutions.
        compress (`int`, *optional*, defaults to 2):
            Reduced dimensionality in residual branches (from Demucs v3).
        num_lstm_layers (`int`, *optional*, defaults to 2):
            Number of LSTM layers at the end of the encoder.
        trim_right_ratio (`float`, *optional*, defaults to 1.0):
            Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
            equal to 1.0, it means that all the trimming is done at the right.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of discrete codes that make up the VQ-VAE.
        codebook_dim (`int`, *optional*):
            Dimension of the codebook vectors. If not defined, uses `hidden_size`.
        use_conv_shortcut (`bool`, *optional*, defaults to `True`):
            Whether to use a convolutional layer as the 'skip' connection in the `EncodecResnetBlock` block. If False,
            an identity function will be used, giving a generic residual connection.

    Example:
        ```python
        >>> from transformers import EncodecModel, EncodecConfig
        ...
        >>> # Initializing a "facebook/encodec_24khz" style configuration
        >>> configuration = EncodecConfig()
        ...
        >>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
        >>> model = EncodecModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "encodec"
    #pylint: disable=W0102
    def __init__(
        self,
        target_bandwidths = [1.5, 3.0, 6.0, 12.0, 24.0],
        sampling_rate=24_000,
        audio_channels=1,
        normalize=False,
        chunk_length_s=None,
        overlap=None,
        hidden_size=128,
        num_filters=32,
        num_residual_layers=1,
        upsampling_ratios = [8, 5, 4, 2],
        norm_type="weight_norm",
        kernel_size=7,
        last_kernel_size=7,
        residual_kernel_size=3,
        dilation_growth_rate=2,
        use_causal_conv=True,
        pad_mode="reflect",
        compress=2,
        num_lstm_layers=2,
        trim_right_ratio=1.0,
        codebook_size=1024,
        codebook_dim=None,
        use_conv_shortcut=True,
        **kwargs,
    ):
        """
        Initializes an instance of the EncodecConfig class.

        Args:
            self: The instance of the class.
            target_bandwidths (list[float]): List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].
            sampling_rate (int): The audio sampling rate in Hz. Default is 24000.
            audio_channels (int): The number of audio channels. Default is 1.
            normalize (bool): Flag indicating whether to normalize the audio. Default is False.
            chunk_length_s (float): The length of audio chunks in seconds. Default is None.
            overlap (float): The overlap ratio between audio chunks. Default is None.
            hidden_size (int): The size of the hidden state in the model. Default is 128.
            num_filters (int): The number of filters in the model. Default is 32.
            num_residual_layers (int): The number of residual layers in the model. Default is 1.
            upsampling_ratios (list[int]): List of upsampling ratios. Default is [8, 5, 4, 2].
            norm_type (str): The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.
            kernel_size (int): The size of the convolutional kernel. Default is 7.
            last_kernel_size (int): The size of the last convolutional kernel. Default is 7.
            residual_kernel_size (int): The size of the residual convolutional kernel. Default is 3.
            dilation_growth_rate (int): The growth rate of dilation in the model. Default is 2.
            use_causal_conv (bool): Flag indicating whether to use causal convolution. Default is True.
            pad_mode (str): The padding mode for convolution. Default is 'reflect'.
            compress (int): The compression factor for audio. Default is 2.
            num_lstm_layers (int): The number of LSTM layers in the model. Default is 2.
            trim_right_ratio (float): The ratio of trimming audio from the right. Default is 1.0.
            codebook_size (int): The size of the codebook. Default is 1024.
            codebook_dim (int): The dimension of the codebook. Default is equal to hidden_size if not provided.
            use_conv_shortcut (bool): Flag indicating whether to use convolution shortcut. Default is True.

        Returns:
            None.

        Raises:
            ValueError: If norm_type is not 'weight_norm' or 'time_group_norm'.

        """
        self.target_bandwidths = target_bandwidths
        self.sampling_rate = sampling_rate
        self.audio_channels = audio_channels
        self.normalize = normalize
        self.chunk_length_s = chunk_length_s
        self.overlap = overlap
        self.hidden_size = hidden_size
        self.num_filters = num_filters
        self.num_residual_layers = num_residual_layers
        self.upsampling_ratios = upsampling_ratios
        self.norm_type = norm_type
        self.kernel_size = kernel_size
        self.last_kernel_size = last_kernel_size
        self.residual_kernel_size = residual_kernel_size
        self.dilation_growth_rate = dilation_growth_rate
        self.use_causal_conv = use_causal_conv
        self.pad_mode = pad_mode
        self.compress = compress
        self.num_lstm_layers = num_lstm_layers
        self.trim_right_ratio = trim_right_ratio
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
        self.use_conv_shortcut = use_conv_shortcut

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        super().__init__(**kwargs)

    # This is a property because you might want to change the chunk_length_s on the fly
    @property
    def chunk_length(self) -> Optional[int]:
        r"""
        chunk_length
        """
        if self.chunk_length_s is None:
            return None
        return int(self.chunk_length_s * self.sampling_rate)

    # This is a property because you might want to change the chunk_length_s on the fly
    @property
    def chunk_stride(self) -> Optional[int]:
        r"""
        chunk_stride
        """
        if self.chunk_length_s is None or self.overlap is None:
            return None
        return max(1, int((1.0 - self.overlap) * self.chunk_length))

    @property
    def frame_rate(self) -> int:
        r"""
        frame_rate
        """
        hop_length = np.prod(self.upsampling_ratios)
        return math.ceil(self.sampling_rate / hop_length)

    @property
    def num_quantizers(self) -> int:
        r"""
        num_quantizers
        """
        return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * 10))

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.chunk_length: Optional[int] property

The chunk length in samples, `int(chunk_length_s * sampling_rate)`, or `None` if `chunk_length_s` is not set.

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.chunk_stride: Optional[int] property

The stride between chunks in samples, `max(1, int((1.0 - overlap) * chunk_length))`, or `None` if `chunk_length_s` or `overlap` is not set.

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.frame_rate: int property

The number of latent frames per second produced by the encoder, `ceil(sampling_rate / prod(upsampling_ratios))`.

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.num_quantizers: int property

The number of residual quantizers needed to reach the highest target bandwidth.
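
The constant `10` in the `num_quantizers` source above corresponds to `log2(codebook_size)` for the default `codebook_size=1024`: each quantizer emits one 10-bit code per latent frame, so at 75 frames per second a single quantizer costs 750 bits per second. A small worked check in plain Python (the numbers assume the default configuration):

```python
import math

bits_per_code = math.log2(1024)          # 10 bits per codebook entry
bps_per_quantizer = bits_per_code * 75   # 750 bits/s at frame_rate = 75
print(24_000 / bps_per_quantizer)        # 32.0 -> 32 quantizers for the 24 kbps target
```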

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.__init__(target_bandwidths=[1.5, 3.0, 6.0, 12.0, 24.0], sampling_rate=24000, audio_channels=1, normalize=False, chunk_length_s=None, overlap=None, hidden_size=128, num_filters=32, num_residual_layers=1, upsampling_ratios=[8, 5, 4, 2], norm_type='weight_norm', kernel_size=7, last_kernel_size=7, residual_kernel_size=3, dilation_growth_rate=2, use_causal_conv=True, pad_mode='reflect', compress=2, num_lstm_layers=2, trim_right_ratio=1.0, codebook_size=1024, codebook_dim=None, use_conv_shortcut=True, **kwargs)

Initializes an instance of the EncodecConfig class.

PARAMETER DESCRIPTION
self

The instance of the class.

target_bandwidths

List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].

TYPE: list[float] DEFAULT: [1.5, 3.0, 6.0, 12.0, 24.0]

sampling_rate

The audio sampling rate in Hz. Default is 24000.

TYPE: int DEFAULT: 24000

audio_channels

The number of audio channels. Default is 1.

TYPE: int DEFAULT: 1

normalize

Flag indicating whether to normalize the audio. Default is False.

TYPE: bool DEFAULT: False

chunk_length_s

The length of audio chunks in seconds. Default is None.

TYPE: float DEFAULT: None

overlap

The overlap ratio between audio chunks. Default is None.

TYPE: float DEFAULT: None

hidden_size

The size of the hidden state in the model. Default is 128.

TYPE: int DEFAULT: 128

num_filters

The number of filters in the model. Default is 32.

TYPE: int DEFAULT: 32

num_residual_layers

The number of residual layers in the model. Default is 1.

TYPE: int DEFAULT: 1

upsampling_ratios

List of upsampling ratios. Default is [8, 5, 4, 2].

TYPE: list[int] DEFAULT: [8, 5, 4, 2]

norm_type

The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.

TYPE: str DEFAULT: 'weight_norm'

kernel_size

The size of the convolutional kernel. Default is 7.

TYPE: int DEFAULT: 7

last_kernel_size

The size of the last convolutional kernel. Default is 7.

TYPE: int DEFAULT: 7

residual_kernel_size

The size of the residual convolutional kernel. Default is 3.

TYPE: int DEFAULT: 3

dilation_growth_rate

The growth rate of dilation in the model. Default is 2.

TYPE: int DEFAULT: 2

use_causal_conv

Flag indicating whether to use causal convolution. Default is True.

TYPE: bool DEFAULT: True

pad_mode

The padding mode for convolution. Default is 'reflect'.

TYPE: str DEFAULT: 'reflect'

compress

The compression factor for audio. Default is 2.

TYPE: int DEFAULT: 2

num_lstm_layers

The number of LSTM layers in the model. Default is 2.

TYPE: int DEFAULT: 2

trim_right_ratio

The ratio of trimming audio from the right. Default is 1.0.

TYPE: float DEFAULT: 1.0

codebook_size

The size of the codebook. Default is 1024.

TYPE: int DEFAULT: 1024

codebook_dim

The dimension of the codebook. Default is equal to hidden_size if not provided.

TYPE: int DEFAULT: None

use_conv_shortcut

Flag indicating whether to use convolution shortcut. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If norm_type is not 'weight_norm' or 'time_group_norm'.
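
As a quick illustration of the `norm_type` check, a hedged sketch (assuming the `mindnlp.transformers` import path; the exception text comes from the constructor source above):

```python
from mindnlp.transformers import EncodecConfig

try:
    EncodecConfig(norm_type="layer_norm")  # not one of the two allowed values
except ValueError as err:
    print(err)  # self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got layer_norm
```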


mindnlp.transformers.models.encodec.feature_extraction_encodec

Feature extractor class for EnCodec.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor

Bases: SequenceFeatureExtractor

Constructs an EnCodec feature extractor.

This feature extractor inherits from [~feature_extraction_sequence_utils.SequenceFeatureExtractor] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

Instantiating a feature extractor with the defaults will yield a similar configuration to that of the facebook/encodec_24khz architecture.

PARAMETER DESCRIPTION
feature_size

The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

sampling_rate

The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 24000 DEFAULT: 24000

padding_value

The value that is used to fill the padding values.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

chunk_length_s

If defined the audio is pre-processed into chunks of lengths chunk_length_s and then encoded.

TYPE: `float`, *optional* DEFAULT: None

overlap

Defines the overlap between each chunk. It is used to compute the chunk_stride using the following formula: int((1.0 - self.overlap) * self.chunk_length).

TYPE: `float`, *optional* DEFAULT: None
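
A minimal usage sketch before diving into the source (assuming the `mindnlp.transformers` import path; the sine wave is a synthetic placeholder input):

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor

# One second of mono audio at 24 kHz (a synthetic 440 Hz tone for illustration).
t = np.linspace(0.0, 1.0, 24000, endpoint=False, dtype=np.float32)
audio = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

feature_extractor = EncodecFeatureExtractor(feature_size=1, sampling_rate=24000)
inputs = feature_extractor(raw_audio=audio, sampling_rate=24000, return_tensors="np")

print(inputs["input_values"].shape)  # (1, 1, 24000): (batch, channels, samples)
print(inputs["padding_mask"].shape)  # (1, 24000)
```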

Source code in mindnlp\transformers\models\encodec\feature_extraction_encodec.py
class EncodecFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs an EnCodec feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://hf-mirror.com/facebook/encodec_24khz) architecture.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        chunk_length_s (`float`, *optional*):
            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
    """
    model_input_names = ["input_values", "padding_mask"]

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 24000,
        padding_value: float = 0.0,
        chunk_length_s: float = None,
        overlap: float = None,
        **kwargs,
    ):
        """
        Initialize the EncodecFeatureExtractor class with the given parameters.

        Args:
            self: The instance of the class.
            feature_size (int): The size of the feature. Default is 1.
            sampling_rate (int): The sampling rate in Hz. Default is 24000.
            padding_value (float): The value used for padding. Default is 0.0.
            chunk_length_s (float): The length of each chunk in seconds. Default is None.
            overlap (float): The overlap ratio between consecutive chunks. Default is None.
            **kwargs: Additional keyword arguments.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.chunk_length_s = chunk_length_s
        self.overlap = overlap

    @property
    def chunk_length(self) -> Optional[int]:
        r"""
        # This is a property because you might want to change the chunk_length_s on the fly
        """
        if self.chunk_length_s is None:
            return None
        return int(self.chunk_length_s * self.sampling_rate)

    @property
    def chunk_stride(self) -> Optional[int]:
        r"""
        # This is a property because you might want to change the chunk_length_s on the fly
        """
        if self.chunk_length_s is None or self.overlap is None:
            return None
        return max(1, int((1.0 - self.overlap) * self.chunk_length))

    def __call__(
        self,
        raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
        truncation: Optional[bool] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
                (`feature_size = 2`).
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                lengths).
            truncation (`bool`, *optional*, defaults to `False`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of lists of Python numbers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
        """
        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        if padding and truncation:
            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
        if padding is None:
            # by default let's pad the inputs
            padding = True

        is_batched = bool(
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
        elif not is_batched and not isinstance(raw_audio, np.ndarray):
            raw_audio = np.asarray(raw_audio, dtype=np.float32)
        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
            raw_audio = raw_audio.astype(np.float32)

        # always return batch
        if not is_batched:
            raw_audio = [np.asarray(raw_audio).T]

        # verify inputs are valid
        for _, example in enumerate(raw_audio):
            if example.ndim > 2:
                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
            if self.feature_size == 1 and example.ndim != 1:
                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
            if self.feature_size == 2 and example.shape[-1] != 2:
                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")

        padded_inputs = None
        input_values = BatchFeature({"input_values": raw_audio})
        if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
            if truncation:
                max_length = min(array.shape[0] for array in raw_audio)
                nb_step = int(np.floor(max_length / self.chunk_stride))
                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
            elif padding:
                max_length = max(array.shape[0] for array in raw_audio)
                nb_step = int(np.ceil(max_length / self.chunk_stride))
                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
                padding = "max_length"
            else:
                padded_inputs = input_values

        # normal padding on batch
        if padded_inputs is None:
            padded_inputs = self.pad(
                input_values,
                max_length=max_length,
                truncation=truncation,
                padding=padding,
                return_attention_mask=padding,
            )
            if padding:
                padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

        input_values = []
        for example in padded_inputs.pop("input_values"):
            if self.feature_size == 1:
                example = example[..., None]
            input_values.append(example.T)

        padded_inputs["input_values"] = input_values
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs
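
The validation loop above rejects channel mismatches early. A hedged sketch of what it accepts (assuming the `mindnlp.transformers` import path): mono extractors (`feature_size=1`) want 1-D input, stereo extractors (`feature_size=2`) want `(2, num_samples)`:

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor

stereo = np.zeros((2, 16000), dtype=np.float32)   # (channels, num_samples)

stereo_fe = EncodecFeatureExtractor(feature_size=2, sampling_rate=24000)
out = stereo_fe(raw_audio=stereo, sampling_rate=24000, return_tensors="np")
print(out["input_values"].shape)                  # (1, 2, 16000)

mono_fe = EncodecFeatureExtractor(feature_size=1, sampling_rate=24000)
try:
    mono_fe(raw_audio=stereo, sampling_rate=24000)  # 2-D input with feature_size=1
except ValueError as err:
    print(err)  # Expected mono audio but example has 2 channels
```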

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.chunk_length: Optional[int] property

The chunk length in samples, or `None` if `chunk_length_s` is not set. This is a property because you might want to change `chunk_length_s` on the fly.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.chunk_stride: Optional[int] property

The stride between chunks in samples, or `None` if `chunk_length_s` or `overlap` is not set. This is a property because you might want to change `chunk_length_s` on the fly.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.__call__(raw_audio, padding=None, truncation=False, max_length=None, return_tensors=None, sampling_rate=None)

Main method to featurize and prepare for the model one or several sequence(s).

PARAMETER DESCRIPTION
raw_audio

The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape (num_samples,) for mono audio (feature_size = 1), or (2, num_samples) for stereo audio (feature_size = 2).

TYPE: `np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`

padding

Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:

  • True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single sequence is provided).
  • 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided.
  • False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).

TYPE: `bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True` DEFAULT: None

truncation

Activates truncation to cut input sequences longer than max_length to max_length.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

max_length

Maximum length of the returned list and optionally padding length (see above).

TYPE: `int`, *optional* DEFAULT: None

return_tensors

If set, will return tensors instead of lists of Python numbers. Acceptable values are:

  • 'tf': Return TensorFlow tf.constant objects.
  • 'pt': Return PyTorch torch.Tensor objects.
  • 'np': Return Numpy np.ndarray objects.

TYPE: `str` or [`~utils.TensorType`], *optional* DEFAULT: None

sampling_rate

The sampling rate at which the audio input was sampled. It is strongly recommended to pass sampling_rate at the forward call to prevent silent errors.

TYPE: `int`, *optional* DEFAULT: None
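
A hedged sketch of the chunked-padding path described above: with `chunk_length_s` and `overlap` set and `padding` left at its default, the batch is rounded up to a whole number of chunk strides before padding (the numbers follow the formulas in the source):

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor

# chunk_length = int(1.0 * 24000) = 24000, chunk_stride = max(1, int(0.5 * 24000)) = 12000
fe = EncodecFeatureExtractor(chunk_length_s=1.0, overlap=0.5)
batch = [np.zeros(30000, dtype=np.float32), np.zeros(50000, dtype=np.float32)]

out = fe(raw_audio=batch, sampling_rate=24000, return_tensors="np")
# longest example is 50000 samples; nb_step = ceil(50000 / 12000) = 5,
# so max_length = (5 - 1) * 12000 + 24000 = 72000 padded samples
print(out["input_values"].shape)  # (2, 1, 72000)
print(out["padding_mask"].shape)  # (2, 72000)
```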


mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.__init__(feature_size=1, sampling_rate=24000, padding_value=0.0, chunk_length_s=None, overlap=None, **kwargs)

Initialize the EncodecFeatureExtractor class with the given parameters.

PARAMETER DESCRIPTION
self

The instance of the class.

feature_size

The size of the feature. Default is 1.

TYPE: int DEFAULT: 1

sampling_rate

The sampling rate in Hz. Default is 24000.

TYPE: int DEFAULT: 24000

padding_value

The value used for padding. Default is 0.0.

TYPE: float DEFAULT: 0.0

chunk_length_s

The length of each chunk in seconds. Default is None.

TYPE: float DEFAULT: None

overlap

The overlap ratio between consecutive chunks. Default is None.

TYPE: float DEFAULT: None

**kwargs

Additional keyword arguments.

DEFAULT: {}

RETURNS DESCRIPTION

None.

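
Finally, an end-to-end sketch tying the feature extractor to the model (hedged: this assumes the `mindnlp.transformers` import path, that `return_tensors="ms"` yields MindSpore tensors, the usual `EncodecModel.encode`/`decode` interface, and network access to the `facebook/encodec_24khz` checkpoint):

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor, EncodecModel

feature_extractor = EncodecFeatureExtractor.from_pretrained("facebook/encodec_24khz")
model = EncodecModel.from_pretrained("facebook/encodec_24khz")

audio = np.zeros(24000, dtype=np.float32)  # one second of silence as a placeholder
inputs = feature_extractor(raw_audio=audio, sampling_rate=24000, return_tensors="ms")

# Encode to discrete codes, then decode back to a waveform.
encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
audio_values = model.decode(
    encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"]
)[0]
print(audio_values.shape)  # (1, 1, num_samples)
```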