encodec

mindnlp.transformers.models.encodec.modeling_encodec

EnCodec model (MindSpore port of the PyTorch implementation).

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConv1d

Bases: Module

Conv1d with asymmetric or causal padding and normalization.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 78-168)
class EncodecConv1d(nn.Module):
    """Conv1d with asymmetric or causal padding and normalization."""

    def __init__(
        self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1
    ):
        super().__init__()
        self.causal = config.use_causal_conv
        self.pad_mode = config.pad_mode
        self.norm_type = config.norm_type

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            logger.warning(
                "EncodecConv1d has been initialized with stride > 1 and dilation > 1"
                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
            )

        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
        if self.norm_type == "weight_norm":
            self.conv = nn.utils.weight_norm(self.conv)
        elif self.norm_type == "time_group_norm":
            self.norm = nn.GroupNorm(1, out_channels)

        kernel_size = self.conv.kernel_size[0]
        stride = mindspore.tensor(self.conv.stride[0], dtype=mindspore.int64)
        dilation = self.conv.dilation[0]

        # Effective kernel size with dilations.
        kernel_size = mindspore.tensor((kernel_size - 1) * dilation + 1, dtype=mindspore.int64)

        self.register_buffer("stride", stride, persistent=False)
        self.register_buffer("kernel_size", kernel_size, persistent=False)
        self.register_buffer("padding_total", mindspore.tensor(kernel_size - stride, dtype=mindspore.int64), persistent=False)

    def _get_extra_padding_for_conv1d(
        self,
        hidden_states: mindspore.Tensor,
    ) -> mindspore.Tensor:
        """See `pad_for_conv1d`."""
        length = hidden_states.shape[-1]
        n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
        n_frames = ops.ceil(n_frames).to(mindspore.int64) - 1
        ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total

        return ideal_length - length

    @staticmethod
    def _pad1d(hidden_states: mindspore.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
        """Tiny wrapper around nn.functional.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        """
        length = hidden_states.shape[-1]
        padding_left, padding_right = paddings
        if mode != 'reflect':
            return nn.functional.pad(hidden_states, paddings, mode, value)

        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
        padded = nn.functional.pad(hidden_states, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]

    def forward(self, hidden_states):
        extra_padding = self._get_extra_padding_for_conv1d(hidden_states).item()

        if self.causal:
            # Left padding for causal
            hidden_states = self._pad1d(hidden_states, (self.padding_total.item(), extra_padding), mode=self.pad_mode)
        else:
            # Asymmetric padding required for odd strides
            padding_right = self.padding_total.item() // 2
            padding_left = self.padding_total.item() - padding_right
            hidden_states = self._pad1d(
                hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
            )

        hidden_states = self.conv(hidden_states)

        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        return hidden_states
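
To make the padding arithmetic concrete, here is a minimal usage sketch; the layer sizes are illustrative assumptions, not values from a checkpoint:

```python
from mindspore import ops
from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig
from mindnlp.transformers.models.encodec.modeling_encodec import EncodecConv1d

# Hypothetical layer: kernel_size=7, stride=2 gives padding_total = 7 - 2 = 5.
config = EncodecConfig(use_causal_conv=True, pad_mode="reflect", norm_type="weight_norm")
conv = EncodecConv1d(config, in_channels=1, out_channels=8, kernel_size=7, stride=2)

x = ops.randn(1, 1, 100)  # (batch, channels, time)
y = conv(x)
# Causal mode puts all 5 padding samples on the left (plus any extra padding on
# the right so every input step falls in a frame): ceil((100 - 7 + 5) / 2 + 1) = 50.
print(y.shape)  # (1, 8, 50)
```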

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConvTranspose1d

Bases: Module

ConvTranspose1d with asymmetric or causal padding and normalization.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 171-220)
class EncodecConvTranspose1d(nn.Module):
    """ConvTranspose1d with asymmetric or causal padding and normalization."""

    def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1):
        super().__init__()
        self.causal = config.use_causal_conv
        self.trim_right_ratio = config.trim_right_ratio
        self.norm_type = config.norm_type
        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
        if config.norm_type == "weight_norm":
            self.conv = nn.utils.weight_norm(self.conv)
        elif config.norm_type == "time_group_norm":
            self.norm = nn.GroupNorm(1, out_channels)

        if not (self.causal or self.trim_right_ratio == 1.0):
            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")

    def forward(self, hidden_states):
        kernel_size = self.conv.kernel_size[0]
        stride = self.conv.stride[0]
        padding_total = kernel_size - stride

        hidden_states = self.conv(hidden_states)

        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
        # removed at the very end, when keeping only the right length for the output,
        # as removing it here would require also passing the length at the matching layer
        # in the encoder.
        if self.causal:
            # Trim the padding on the right according to the specified ratio
            # if trim_right_ratio = 1.0, trim everything from right
            padding_right = math.ceil(padding_total * self.trim_right_ratio)
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2

        padding_left = padding_total - padding_right

        # unpad
        end = hidden_states.shape[-1] - padding_right
        hidden_states = hidden_states[..., padding_left:end]
        return hidden_states
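
As a quick check of the unpadding arithmetic (a sketch with assumed values, not taken from a checkpoint): an upsampling layer built with kernel_size = 2 * ratio and stride = ratio leaves padding_total = ratio samples to trim.

```python
import math

kernel_size, stride = 16, 8           # e.g. a ratio-8 upsampling layer
padding_total = kernel_size - stride  # 8

# causal path with trim_right_ratio = 1.0: everything is trimmed on the right
padding_right = math.ceil(padding_total * 1.0)  # 8
padding_left = padding_total - padding_right    # 0

# non-causal path: asymmetric split for odd strides
padding_right = padding_total // 2              # 4
padding_left = padding_total - padding_right    # 4
```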

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoder

Bases: Module

SEANet decoder as used by EnCodec.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 303-334)
class EncodecDecoder(nn.Module):
    """SEANet decoder as used by EnCodec."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        scaling = int(2 ** len(config.upsampling_ratios))
        model = [EncodecConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]

        model += [EncodecLSTM(config, scaling * config.num_filters)]

        # Upsample to raw audio scale
        for ratio in config.upsampling_ratios:
            current_scale = scaling * config.num_filters
            # Add upsampling layers
            model += [nn.ELU()]
            model += [
                EncodecConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
            ]
            # Add residual layers
            for j in range(config.num_residual_layers):
                model += [EncodecResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
            scaling //= 2

        # Add final layers
        model += [nn.ELU()]
        model += [EncodecConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
        self.layers = nn.ModuleList(model)

    def forward(self, hidden_states):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states
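
With the default config values (num_filters=32, upsampling_ratios=[8, 5, 4, 2]), the decoder's channel schedule works out as sketched below; this is only the arithmetic, not model code:

```python
upsampling_ratios = [8, 5, 4, 2]  # EncodecConfig defaults
num_filters = 32

scaling = 2 ** len(upsampling_ratios)  # 16
channels = scaling * num_filters       # 512: width after the first decoder conv
for ratio in upsampling_ratios:
    print(f"upsample x{ratio}: {channels} -> {channels // 2} channels")
    channels //= 2
# 512 -> 256 -> 128 -> 64 -> 32; the final conv maps 32 -> audio_channels
```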

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoderOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_values

Decoded audio values, obtained using the decoder part of Encodec.

TYPE: `mindspore.Tensor` of shape `(batch_size, segment_length)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 67-75)
@dataclass
class EncodecDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`mindspore.Tensor`  of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """

    audio_values: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoder

Bases: Module

SEANet encoder as used by EnCodec.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 272-300)
class EncodecEncoder(nn.Module):
    """SEANet encoder as used by EnCodec."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        model = [EncodecConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
        scaling = 1

        # Downsample to raw audio scale
        for ratio in reversed(config.upsampling_ratios):
            current_scale = scaling * config.num_filters
            # Add residual layers
            for j in range(config.num_residual_layers):
                model += [EncodecResnetBlock(config, current_scale, [config.dilation_growth_rate**j, 1])]
            # Add downsampling layers
            model += [nn.ELU()]
            model += [EncodecConv1d(config, current_scale, current_scale * 2, kernel_size=ratio * 2, stride=ratio)]
            scaling *= 2

        model += [EncodecLSTM(config, scaling * config.num_filters)]
        model += [nn.ELU()]
        model += [EncodecConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size)]

        self.layers = nn.ModuleList(model)

    def forward(self, hidden_states):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states
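
The encoder's total hop length is the product of the ratios, which fixes the latent frame rate. A worked sketch with the default 24 kHz configuration:

```python
import math

upsampling_ratios = [8, 5, 4, 2]  # the encoder applies these in reverse
sampling_rate = 24_000

hop_length = math.prod(upsampling_ratios)  # 320 samples per latent frame
frame_rate = sampling_rate / hop_length    # 75 frames per second
print(hop_length, frame_rate)              # 320 75.0
```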

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoderOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional* DEFAULT: None

audio_scales

Scaling factor for each audio_codes input. This is used to unscale each chunk of audio when decoding.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 53-64)
@dataclass
class EncodecEncoderOutput(ModelOutput):
    """
    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    """

    audio_codes: mindspore.Tensor = None
    audio_scales: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook

Bases: Module

Codebook with Euclidean distance.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 337-370)
class EncodecEuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        embed = ops.zeros(config.codebook_size, config.codebook_dim)

        self.codebook_size = config.codebook_size

        self.register_buffer("inited", mindspore.Tensor([True]))
        self.register_buffer("cluster_size", ops.zeros(config.codebook_size))
        self.register_buffer("embed", embed)
        self.register_buffer("embed_avg", embed.copy())

    def quantize(self, hidden_states):
        embed = self.embed.t()
        scaled_states = ops.sum(hidden_states.pow(2), 1, keepdim=True)
        dist = -(scaled_states - 2 * hidden_states @ embed + ops.sum(embed.pow(2), 0, keepdim=True))
        embed_ind = ops.max(dist, dim=-1)[1]
        return embed_ind

    def encode(self, hidden_states):
        shape = hidden_states.shape
        # pre-process
        hidden_states = hidden_states.reshape((-1, shape[-1]))
        # quantize
        embed_ind = self.quantize(hidden_states)
        # post-process
        embed_ind = embed_ind.view(*shape[:-1])
        return embed_ind

    def decode(self, embed_ind):
        quantize = nn.functional.embedding(embed_ind, self.embed)
        return quantize
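
quantize avoids forming explicit pairwise differences by expanding ||x - e||^2 = ||x||^2 - 2 x·e + ||e||^2, negating it, and taking the argmax over codebook entries. A standalone sketch of the same trick (the shapes are assumptions for illustration):

```python
from mindspore import ops

hidden_states = ops.randn(5, 128)  # 5 vectors to quantize
embed = ops.randn(1024, 128)       # codebook: 1024 entries of dimension 128

x_sq = ops.sum(hidden_states.pow(2), 1, keepdim=True)  # (5, 1)
e_sq = ops.sum(embed.t().pow(2), 0, keepdim=True)      # (1, 1024)
dist = -(x_sq - 2 * hidden_states @ embed.t() + e_sq)  # negative squared distances
codes = dist.argmax(-1)                                # index of the nearest entry
```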

mindnlp.transformers.models.encodec.modeling_encodec.EncodecLSTM

Bases: Module

LSTM layer that manages its hidden state and data layout internally. Expects input in convolutional layout (batch, channels, time).

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 223-236)
class EncodecLSTM(nn.Module):
    """
    LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
    """

    def __init__(self, config, dimension):
        super().__init__()
        self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)

    def forward(self, hidden_states):
        # (batch, channels, time) -> (time, batch, channels): the LSTM is sequence-first
        hidden_states = hidden_states.permute(2, 0, 1)
        # residual connection around the LSTM
        hidden_states = self.lstm(hidden_states)[0] + hidden_states
        # back to convolutional layout
        hidden_states = hidden_states.permute(1, 2, 0)
        return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel

Bases: EncodecPreTrainedModel

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 473-746)
class EncodecModel(EncodecPreTrainedModel):
    def __init__(self, config: EncodecConfig):
        super().__init__(config)
        self.config = config

        self.encoder = EncodecEncoder(config)
        self.decoder = EncodecDecoder(config)

        self.quantizer = EncodecResidualVectorQuantizer(config)

        self.bits_per_codebook = int(math.log2(self.config.codebook_size))
        if 2**self.bits_per_codebook != self.config.codebook_size:
            raise ValueError("The codebook_size must be a power of 2.")

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _encode_frame(
        self, input_values: mindspore.Tensor, bandwidth: float, padding_mask: int
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor]]:
        """
        Encodes the given input using the underlying VQVAE. If `config.normalize` is set to `True` the input is first
        normalized. The padding mask is required to compute the correct scale.
        """
        length = input_values.shape[-1]
        duration = length / self.config.sampling_rate

        if self.config.chunk_length_s is not None and duration > 1e-5 + self.config.chunk_length_s:
            raise RuntimeError(f"Duration of frame ({duration}) is longer than chunk {self.config.chunk_length_s}")

        scale = None
        if self.config.normalize:
            # zero out any padded values before computing the scale
            input_values = input_values * padding_mask
            mono = ops.sum(input_values, 1, keepdim=True) / input_values.shape[1]
            scale = ops.mean(mono.pow(2), dim=-1, keepdim=True).sqrt() + 1e-8
            input_values = input_values / scale

        embeddings = self.encoder(input_values)
        codes = self.quantizer.encode(embeddings, bandwidth)
        codes = ops.transpose(codes, 0, 1)
        return codes, scale

    def encode(
        self,
        input_values: mindspore.Tensor,
        padding_mask: mindspore.Tensor = None,
        bandwidth: Optional[float] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, Optional[mindspore.Tensor]], EncodecEncoderOutput]:
        """
        Encodes the input audio waveform into discrete codes.

        Args:
            input_values (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform.
            padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            bandwidth (`float`, *optional*):
                The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
                bandwidth. Bandwidth is expressed in kbps (thousands of bits per second), e.g. a 6 kbps target is
                passed as `bandwidth == 6.0`.

        Returns:
            A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
            factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
            `codebook` of shape `[batch_size, num_codebooks, frames]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if bandwidth is None:
            bandwidth = self.config.target_bandwidths[0]
        if bandwidth not in self.config.target_bandwidths:
            raise ValueError(
                f"This model doesn't support the bandwidth {bandwidth}. "
                f"Select one of {self.config.target_bandwidths}."
            )

        _, channels, input_length = input_values.shape

        if channels < 1 or channels > 2:
            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

        chunk_length = self.config.chunk_length
        if chunk_length is None:
            chunk_length = input_length
            stride = input_length
        else:
            stride = self.config.chunk_stride

        if padding_mask is None:
            padding_mask = ops.ones_like(input_values).bool()

        encoded_frames = []
        scales = []

        step = chunk_length - stride
        if (input_length % stride) - step != 0:
            raise ValueError(
                "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
            )

        for offset in range(0, input_length - step, stride):
            mask = padding_mask[..., offset : offset + chunk_length].bool()
            frame = input_values[:, :, offset : offset + chunk_length]
            encoded_frame, scale = self._encode_frame(frame, bandwidth, mask)
            encoded_frames.append(encoded_frame)
            scales.append(scale)

        encoded_frames = ops.stack(encoded_frames)

        if not return_dict:
            return (encoded_frames, scales)

        return EncodecEncoderOutput(encoded_frames, scales)

    @staticmethod
    def _linear_overlap_add(frames: List[mindspore.Tensor], stride: int):
        # Generic overlap add, with linear fade-in/fade-out, supporting complex scenario
        # e.g., more than 2 frames per position.
        # The core idea is to use a weight function that is a triangle,
        # with a maximum value at the middle of the chunk.
        # We use this weighting when summing the frames, and divide by the sum of weights
        # for each positions at the end. Thus:
        #   - if a frame is the only one to cover a position, the weighting is a no-op.
        #   - if 2 frames cover a position:
        #          ...  ...
        #         /   \/   \
        #        /    /\    \
        #            S  T       , i.e. S offset of second frame starts, T end of first frame.
        # Then the weight function for each one is: (t - S), (T - t), with `t` a given offset.
        # After the final normalization, the weight of the second frame at position `t` is
        # (t - S) / (t - S + (T - t)) = (t - S) / (T - S), which is exactly what we want.
        #
        #   - if more than 2 frames overlap at a given point, we hope that by induction
        #      something sensible happens.
        if len(frames) == 0:
            raise ValueError("`frames` cannot be an empty list.")

        dtype = frames[0].dtype
        shape = frames[0].shape[:-1]
        total_size = stride * (len(frames) - 1) + frames[-1].shape[-1]

        frame_length = frames[0].shape[-1]
        time_vec = ops.linspace(0, 1, frame_length + 2, dtype=dtype)[1:-1]
        weight = 0.5 - (time_vec - 0.5).abs()

        sum_weight = ops.zeros(total_size, dtype=dtype)
        out = ops.zeros(*shape, total_size, dtype=dtype)
        offset: int = 0

        for frame in frames:
            frame_length = frame.shape[-1]
            out[..., offset : offset + frame_length] += weight[:frame_length] * frame
            sum_weight[offset : offset + frame_length] += weight[:frame_length]
            offset += stride

        if sum_weight.min() == 0:
            raise ValueError(f"`sum_weight` minimum element must be greater than zero: {sum_weight}")

        return out / sum_weight

    def _decode_frame(self, codes: mindspore.Tensor, scale: Optional[mindspore.Tensor] = None) -> mindspore.Tensor:
        codes = ops.transpose(codes, 0, 1)
        embeddings = self.quantizer.decode(codes)
        outputs = self.decoder(embeddings)
        if scale is not None:
            outputs = outputs * scale.view(-1, 1, 1)
        return outputs

    def decode(
        self,
        audio_codes: mindspore.Tensor,
        audio_scales: mindspore.Tensor,
        padding_mask: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecDecoderOutput]:
        """
        Decodes the given frames into an output audio waveform.

        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
        trimmed.

        Args:
            audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
                Discrete code embeddings computed using `model.encode`.
            audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
                Scaling factor for each `audio_codes` input.
            padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        chunk_length = self.config.chunk_length
        if chunk_length is None:
            if len(audio_codes) != 1:
                raise ValueError(f"Expected one frame, got {len(audio_codes)}")
            audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
        else:
            decoded_frames = []

            for frame, scale in zip(audio_codes, audio_scales):
                frames = self._decode_frame(frame, scale)
                decoded_frames.append(frames)

            audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

        # truncate based on padding mask
        if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
            audio_values = audio_values[..., : padding_mask.shape[-1]]

        if not return_dict:
            return (audio_values,)
        return EncodecDecoderOutput(audio_values)

    def forward(
        self,
        input_values: mindspore.Tensor,
        padding_mask: Optional[mindspore.Tensor] = None,
        bandwidth: Optional[float] = None,
        audio_codes: Optional[mindspore.Tensor] = None,
        audio_scales: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from mindnlp.transformers import AutoProcessor, EncodecModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model_id = "facebook/encodec_24khz"
        >>> model = EncodecModel.from_pretrained(model_id)
        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> inputs = processor(raw_audio=audio_sample, return_tensors="ms")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if padding_mask is None:
            padding_mask = ops.ones_like(input_values).bool()

        if audio_codes is not None and audio_scales is None:
            raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")

        if audio_scales is not None and audio_codes is None:
            raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")

        if audio_scales is None and audio_codes is None:
            audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)

        audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
        if not return_dict:
            return (audio_codes, audio_values)

        return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)
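
The triangular weighting in _linear_overlap_add can be sanity-checked numerically. A small sketch with two overlapping constant frames (values chosen purely for illustration):

```python
from mindspore import ops
from mindnlp.transformers.models.encodec.modeling_encodec import EncodecModel

# two frames of length 6 overlapping by 3 samples (stride = 3)
frames = [ops.ones((1, 6)), ops.ones((1, 6)) * 3.0]
out = EncodecModel._linear_overlap_add(frames, 3)

print(out.shape)  # (1, 9)
# Outside the overlap each frame passes through unchanged (the normalization
# makes the weighting a no-op); inside it, the output fades linearly from 1 to 3.
```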

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.decode(audio_codes, audio_scales, padding_mask=None, return_dict=None)

Decodes the given frames into an output audio waveform.

Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be trimmed.

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*

audio_scales

Scaling factor for each audio_codes input.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*

padding_mask

Padding mask used to pad the input_values.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)` DEFAULT: None

return_dict

Whether or not to return a [~utils.ModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 649-695)
def decode(
    self,
    audio_codes: mindspore.Tensor,
    audio_scales: mindspore.Tensor,
    padding_mask: Optional[mindspore.Tensor] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecDecoderOutput]:
    """
    Decodes the given frames into an output audio waveform.

    Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
    trimmed.

    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input.
        padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Padding mask used to pad the `input_values`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

    """
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    chunk_length = self.config.chunk_length
    if chunk_length is None:
        if len(audio_codes) != 1:
            raise ValueError(f"Expected one frame, got {len(audio_codes)}")
        audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
    else:
        decoded_frames = []

        for frame, scale in zip(audio_codes, audio_scales):
            frames = self._decode_frame(frame, scale)
            decoded_frames.append(frames)

        audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

    # truncate based on padding mask
    if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
        audio_values = audio_values[..., : padding_mask.shape[-1]]

    if not return_dict:
        return (audio_values,)
    return EncodecDecoderOutput(audio_values)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.encode(input_values, padding_mask=None, bandwidth=None, return_dict=None)

Encodes the input audio waveform into discrete codes.

PARAMETER DESCRIPTION
input_values

Float values of the input audio waveform.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`

padding_mask

Padding mask used to pad the input_values.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)` DEFAULT: None

bandwidth

The target bandwidth. Must be one of config.target_bandwidths. If None, the smallest possible bandwidth is used. Bandwidth is expressed in kbps (thousands of bits per second), e.g. a 6 kbps target is passed as bandwidth == 6.0.

TYPE: `float`, *optional* DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple[Tensor, Optional[Tensor]], EncodecEncoderOutput]

A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling factors for each chunk when normalize is True. Each frame is a tuple (codebook, scale), with codebook of shape [batch_size, num_codebooks, frames].

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 522-593)
def encode(
    self,
    input_values: mindspore.Tensor,
    padding_mask: mindspore.Tensor = None,
    bandwidth: Optional[float] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, Optional[mindspore.Tensor]], EncodecEncoderOutput]:
    """
    Encodes the input audio waveform into discrete codes.

    Args:
        input_values (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Float values of the input audio waveform.
        padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Padding mask used to pad the `input_values`.
        bandwidth (`float`, *optional*):
            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
            bandwidth. Bandwidth is expressed in kbps (thousands of bits per second), e.g. a 6 kbps target is
            passed as `bandwidth == 6.0`.

    Returns:
        A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
        factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
        `codebook` of shape `[batch_size, num_codebooks, frames]`.
    """
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    if bandwidth is None:
        bandwidth = self.config.target_bandwidths[0]
    if bandwidth not in self.config.target_bandwidths:
        raise ValueError(
            f"This model doesn't support the bandwidth {bandwidth}. "
            f"Select one of {self.config.target_bandwidths}."
        )

    _, channels, input_length = input_values.shape

    if channels < 1 or channels > 2:
        raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

    chunk_length = self.config.chunk_length
    if chunk_length is None:
        chunk_length = input_length
        stride = input_length
    else:
        stride = self.config.chunk_stride

    if padding_mask is None:
        padding_mask = ops.ones_like(input_values).bool()

    encoded_frames = []
    scales = []

    step = chunk_length - stride
    if (input_length % stride) - step != 0:
        raise ValueError(
            "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
        )

    for offset in range(0, input_length - step, stride):
        mask = padding_mask[..., offset : offset + chunk_length].bool()
        frame = input_values[:, :, offset : offset + chunk_length]
        encoded_frame, scale = self._encode_frame(frame, bandwidth, mask)
        encoded_frames.append(encoded_frame)
        scales.append(scale)

    encoded_frames = ops.stack(encoded_frames)

    if not return_dict:
        return (encoded_frames, scales)

    return EncodecEncoderOutput(encoded_frames, scales)
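
A round-trip sketch building on the forward example below; it assumes the same model, processor, and inputs, and that the requested bandwidth is one of config.target_bandwidths:

```python
>>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"], bandwidth=6.0)
>>> audio_values = model.decode(
...     encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"]
... )[0]
```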

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.forward(input_values, padding_mask=None, bandwidth=None, audio_codes=None, audio_scales=None, return_dict=None)

Examples:

```python
>>> from datasets import load_dataset
>>> from mindnlp.transformers import AutoProcessor, EncodecModel

>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]

>>> model_id = "facebook/encodec_24khz"
>>> model = EncodecModel.from_pretrained(model_id)
>>> processor = AutoProcessor.from_pretrained(model_id)

>>> inputs = processor(raw_audio=audio_sample, return_tensors="ms")

>>> outputs = model(**inputs)
>>> audio_codes = outputs.audio_codes
>>> audio_values = outputs.audio_values
```
Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 697-746)
def forward(
    self,
    input_values: mindspore.Tensor,
    padding_mask: Optional[mindspore.Tensor] = None,
    bandwidth: Optional[float] = None,
    audio_codes: Optional[mindspore.Tensor] = None,
    audio_scales: Optional[mindspore.Tensor] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]:
    r"""
    Returns:

    Examples:

    ```python
    >>> from datasets import load_dataset
    >>> from mindnlp.transformers import AutoProcessor, EncodecModel

    >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
    >>> audio_sample = dataset["train"]["audio"][0]["array"]

    >>> model_id = "facebook/encodec_24khz"
    >>> model = EncodecModel.from_pretrained(model_id)
    >>> processor = AutoProcessor.from_pretrained(model_id)

    >>> inputs = processor(raw_audio=audio_sample, return_tensors="ms")

    >>> outputs = model(**inputs)
    >>> audio_codes = outputs.audio_codes
    >>> audio_values = outputs.audio_values
    ```"""
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    if padding_mask is None:
        padding_mask = ops.ones_like(input_values).bool()

    if audio_codes is not None and audio_scales is None:
        raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")

    if audio_scales is not None and audio_codes is None:
        raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")

    if audio_scales is None and audio_codes is None:
        audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)

    audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
    if not return_dict:
        return (audio_codes, audio_values)

    return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional* DEFAULT: None

audio_values

Decoded audio values, obtained using the decoder part of Encodec.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 39-50)
@dataclass
class EncodecOutput(ModelOutput):
    """
    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """

    audio_codes: mindspore.Tensor = None
    audio_values: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 437-470)
class EncodecPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = EncodecConfig
    base_model_prefix = "encodec"
    main_input_name = "input_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    nn.init.constant_(param, 0.0)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer

Bases: Module

Residual Vector Quantizer.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 393-434)
class EncodecResidualVectorQuantizer(nn.Module):
    """Residual Vector Quantizer."""

    def __init__(self, config: EncodecConfig):
        super().__init__()
        self.codebook_size = config.codebook_size
        self.frame_rate = config.frame_rate
        self.num_quantizers = config.num_quantizers
        self.layers = nn.ModuleList([EncodecVectorQuantization(config) for _ in range(config.num_quantizers)])

    def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
        """Return num_quantizers based on specified target bandwidth."""
        bw_per_q = math.log2(self.codebook_size) * self.frame_rate
        num_quantizers = self.num_quantizers
        if bandwidth is not None and bandwidth > 0.0:
            num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
        return num_quantizers

    def encode(self, embeddings: mindspore.Tensor, bandwidth: Optional[float] = None) -> mindspore.Tensor:
        """
        Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
        the appropriate number of quantizers to use and returns indices for each quantizer.
        """
        num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
        residual = embeddings
        all_indices = []
        for layer in self.layers[:num_quantizers]:
            indices = layer.encode(residual)
            quantized = layer.decode(indices)
            residual = residual - quantized
            all_indices.append(indices)
        out_indices = ops.stack(all_indices)
        return out_indices

    def decode(self, codes: mindspore.Tensor) -> mindspore.Tensor:
        """Decode the given codes to the quantized representation."""
        quantized_out = mindspore.tensor(0.0)
        for i, indices in enumerate(codes):
            layer = self.layers[i]
            quantized = layer.decode(indices)
            quantized_out = quantized_out.to(quantized.dtype) + quantized
        return quantized_out
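
The residual idea is easy to see in isolation: each stage quantizes only the error the previous stages left behind, so the reconstruction refines as codes accumulate. A toy sketch in plain Python, with scalar "codebooks" standing in for the learned ones:

```python
def quantize_stage(residual, step):
    code = round(residual / step)  # index this stage would emit
    return code, code * step       # (index, reconstructed value)

residual = 0.7134
codes = []
for step in (0.1, 0.01, 0.001):    # three RVQ stages, coarse to fine
    code, recon = quantize_stage(residual, step)
    codes.append(code)
    residual -= recon              # the next stage sees only the error
print(codes)  # [7, 1, 3]; the summed reconstruction 0.713 approaches the input
```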

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.decode(codes)

Decode the given codes to the quantized representation.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 427-434)
def decode(self, codes: mindspore.Tensor) -> mindspore.Tensor:
    """Decode the given codes to the quantized representation."""
    quantized_out = mindspore.tensor(0.0)
    for i, indices in enumerate(codes):
        layer = self.layers[i]
        quantized = layer.decode(indices)
        quantized_out = quantized_out.to(quantized.dtype) + quantized
    return quantized_out

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.encode(embeddings, bandwidth=None)

Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets the appropriate number of quantizers to use and returns indices for each quantizer.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 411-425)
def encode(self, embeddings: mindspore.Tensor, bandwidth: Optional[float] = None) -> mindspore.Tensor:
    """
    Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
    the appropriate number of quantizers to use and returns indices for each quantizer.
    """
    num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
    residual = embeddings
    all_indices = []
    for layer in self.layers[:num_quantizers]:
        indices = layer.encode(residual)
        quantized = layer.decode(indices)
        residual = residual - quantized
        all_indices.append(indices)
    out_indices = ops.stack(all_indices)
    return out_indices

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.get_num_quantizers_for_bandwidth(bandwidth=None)

Return num_quantizers based on specified target bandwidth.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 403-409)
def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
    """Return num_quantizers based on specified target bandwidth."""
    bw_per_q = math.log2(self.codebook_size) * self.frame_rate
    num_quantizers = self.num_quantizers
    if bandwidth is not None and bandwidth > 0.0:
        num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
    return num_quantizers
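
With the default 24 kHz setup (codebook_size=1024, i.e. 10 bits per code, and frame_rate=75), each quantizer costs 750 bits per second. A worked sketch of the resulting quantizer counts:

```python
import math

codebook_size, frame_rate = 1024, 75
bw_per_q = math.log2(codebook_size) * frame_rate  # 10 * 75 = 750 bps per quantizer
for bandwidth in (1.5, 3.0, 6.0, 12.0, 24.0):     # kbps
    print(bandwidth, int(max(1, math.floor(bandwidth * 1000 / bw_per_q))))
# 1.5 -> 2, 3.0 -> 4, 6.0 -> 8, 12.0 -> 16, 24.0 -> 32 quantizers
```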

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResnetBlock

Bases: Module

Residual block from SEANet model as used by EnCodec.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 239-269)
class EncodecResnetBlock(nn.Module):
    """
    Residual block from SEANet model as used by EnCodec.
    """

    def __init__(self, config: EncodecConfig, dim: int, dilations: List[int]):
        super().__init__()
        kernel_sizes = (config.residual_kernel_size, 1)
        if len(kernel_sizes) != len(dilations):
            raise ValueError("Number of kernel sizes should match number of dilations")

        hidden = dim // config.compress
        block = []
        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
            in_chs = dim if i == 0 else hidden
            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
            block += [nn.ELU()]
            block += [EncodecConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation)]
        self.block = nn.ModuleList(block)

        if config.use_conv_shortcut:
            self.shortcut = EncodecConv1d(config, dim, dim, kernel_size=1)
        else:
            self.shortcut = nn.Identity()

    def forward(self, hidden_states):
        residual = hidden_states
        for layer in self.block:
            hidden_states = layer(hidden_states)

        return self.shortcut(residual) + hidden_states
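
The block is a two-convolution bottleneck around a residual connection; with compress=2 the hidden width is half of dim. A shape sketch with assumed values:

```python
dim, compress = 64, 2
hidden = dim // compress  # 32

# conv 1: dim -> hidden channels, kernel_size = residual_kernel_size, dilated
# conv 2: hidden -> dim channels, kernel_size = 1
# output = shortcut(input) + block(input); shapes stay (batch, dim, time)
print(dim, hidden)  # 64 32
```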

mindnlp.transformers.models.encodec.modeling_encodec.EncodecVectorQuantization

Bases: Module

Vector quantization implementation. Currently supports only euclidean distance.

Source code in mindnlp\transformers\models\encodec\modeling_encodec.py (lines 373-390)
class EncodecVectorQuantization(nn.Module):
    """
    Vector quantization implementation. Currently supports only euclidean distance.
    """

    def __init__(self, config: EncodecConfig):
        super().__init__()
        self.codebook = EncodecEuclideanCodebook(config)

    def encode(self, hidden_states):
        hidden_states = hidden_states.permute(0, 2, 1)
        embed_in = self.codebook.encode(hidden_states)
        return embed_in

    def decode(self, embed_ind):
        quantize = self.codebook.decode(embed_ind)
        quantize = quantize.permute(0, 2, 1)
        return quantize

mindnlp.transformers.models.encodec.configuration_encodec

Encodec Model config

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of an [EncodecModel]. It is used to instantiate an Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the facebook/encodec_24khz architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
target_bandwidths

The range of different bandwidths the model can encode audio with.

TYPE: `List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]` DEFAULT: [1.5, 3.0, 6.0, 12.0, 24.0]

sampling_rate

The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 24000 DEFAULT: 24000

audio_channels

Number of channels in the audio data. Either 1 for mono or 2 for stereo.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

normalize

Whether the audio shall be normalized when passed.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

chunk_length_s

If defined, the audio is pre-processed into chunks of length chunk_length_s and then encoded.

TYPE: `float`, *optional* DEFAULT: None

overlap

Defines the overlap between consecutive chunks. It is used to compute the chunk_stride via the formula int((1.0 - self.overlap) * self.chunk_length); see the sketch after this parameter list.

TYPE: `float`, *optional* DEFAULT: None

hidden_size

Intermediate representation dimension.

TYPE: `int`, *optional*, defaults to 128 DEFAULT: 128

num_filters

Number of convolution kernels of first EncodecConv1d down sampling layer.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

num_residual_layers

Number of residual layers.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

upsampling_ratios

Kernel size and stride ratios. The encoder uses downsampling ratios rather than upsampling ratios, so it applies these ratios in reverse order; the order given here must match the decoder order.

TYPE: `Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]` DEFAULT: [8, 5, 4, 2]

norm_type

Normalization method. Must be one of ["weight_norm", "time_group_norm"].

TYPE: `str`, *optional*, defaults to `"weight_norm"` DEFAULT: 'weight_norm'

kernel_size

Kernel size for the initial convolution.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

last_kernel_size

Kernel size for the last convolution layer.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

residual_kernel_size

Kernel size for the residual layers.

TYPE: `int`, *optional*, defaults to 3 DEFAULT: 3

dilation_growth_rate

How much to increase the dilation with each layer.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

use_causal_conv

Whether to use fully causal convolution.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

pad_mode

Padding mode for the convolutions.

TYPE: `str`, *optional*, defaults to `"reflect"` DEFAULT: 'reflect'

compress

Reduced dimensionality in residual branches (from Demucs v3).

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

num_lstm_layers

Number of LSTM layers at the end of the encoder.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

trim_right_ratio

Ratio for trimming at the right of the transposed convolution under the use_causal_conv = True setup. If equal to 1.0, it means that all the trimming is done at the right.

TYPE: `float`, *optional*, defaults to 1.0 DEFAULT: 1.0

codebook_size

Number of discrete codes that make up the VQVAE.

TYPE: `int`, *optional*, defaults to 1024 DEFAULT: 1024

codebook_dim

Dimension of the codebook vectors. If not defined, uses hidden_size.

TYPE: `int`, *optional* DEFAULT: None

use_conv_shortcut

Whether to use a convolutional layer as the 'skip' connection in the EncodecResnetBlock block. If False, an identity function will be used, giving a generic residual connection.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True
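
To make the chunking parameters concrete, here is a sketch of how chunk_length and chunk_stride are derived; the values are illustrative (loosely modeled on the 48 kHz checkpoint), not defaults of this config:

```python
sampling_rate, chunk_length_s, overlap = 48_000, 1.0, 0.01

chunk_length = int(chunk_length_s * sampling_rate)          # 48000 samples
chunk_stride = max(1, int((1.0 - overlap) * chunk_length))  # 47520 samples
```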

Example
```python
>>> from mindnlp.transformers import EncodecModel, EncodecConfig
...
>>> # Initializing a "facebook/encodec_24khz" style configuration
>>> configuration = EncodecConfig()
...
>>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
>>> model = EncodecModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
```
Source code in mindnlp\transformers\models\encodec\configuration_encodec.py (lines 34-239)
class EncodecConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EncodecModel`]. It is used to instantiate an
    Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://hf-mirror.com/facebook/encodec_24khz) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        target_bandwidths (`List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]`):
            The range of different bandwidths the model can encode audio with.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        audio_channels (`int`, *optional*, defaults to 1):
            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether the audio shall be normalized when passed.
        chunk_length_s (`float`, *optional*):
            If defined, the audio is pre-processed into chunks of length `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between consecutive chunks. It is used to compute the `chunk_stride` via the
            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
        hidden_size (`int`, *optional*, defaults to 128):
            Intermediate representation dimension.
        num_filters (`int`, *optional*, defaults to 32):
            Number of convolution kernels of first `EncodecConv1d` down sampling layer.
        num_residual_layers (`int`,  *optional*, defaults to 1):
            Number of residual layers.
        upsampling_ratios (`Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]`):
            Kernel size and stride ratios. The encoder uses downsampling ratios rather than upsampling ratios, so it
            applies these ratios in reverse order; the order given here must match the decoder order.
        norm_type (`str`, *optional*, defaults to `"weight_norm"`):
            Normalization method. Must be one of `["weight_norm", "time_group_norm"]`.
        kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for the initial convolution.
        last_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for the last convolution layer.
        residual_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the residual layers.
        dilation_growth_rate (`int`, *optional*, defaults to 2):
            How much to increase the dilation with each layer.
        use_causal_conv (`bool`, *optional*, defaults to `True`):
            Whether to use fully causal convolution.
        pad_mode (`str`, *optional*, defaults to `"reflect"`):
            Padding mode for the convolutions.
        compress (`int`, *optional*, defaults to 2):
            Reduced dimensionality in residual branches (from Demucs v3).
        num_lstm_layers (`int`, *optional*, defaults to 2):
            Number of LSTM layers at the end of the encoder.
        trim_right_ratio (`float`, *optional*, defaults to 1.0):
            Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
            equal to 1.0, it means that all the trimming is done at the right.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of discrete codes that make up the VQ-VAE.
        codebook_dim (`int`, *optional*):
            Dimension of the codebook vectors. If not defined, uses `hidden_size`.
        use_conv_shortcut (`bool`, *optional*, defaults to `True`):
            Whether to use a convolutional layer as the 'skip' connection in the `EncodecResnetBlock` block. If False,
            an identity function will be used, giving a generic residual connection.

    Example:
        ```python
        >>> from transformers import EncodecModel, EncodecConfig
        ...
        >>> # Initializing a "facebook/encodec_24khz" style configuration
        >>> configuration = EncodecConfig()
        ...
        >>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
        >>> model = EncodecModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "encodec"
    #pylint: disable=W0102
    def __init__(
        self,
        target_bandwidths = [1.5, 3.0, 6.0, 12.0, 24.0],
        sampling_rate=24_000,
        audio_channels=1,
        normalize=False,
        chunk_length_s=None,
        overlap=None,
        hidden_size=128,
        num_filters=32,
        num_residual_layers=1,
        upsampling_ratios = [8, 5, 4, 2],
        norm_type="weight_norm",
        kernel_size=7,
        last_kernel_size=7,
        residual_kernel_size=3,
        dilation_growth_rate=2,
        use_causal_conv=True,
        pad_mode="reflect",
        compress=2,
        num_lstm_layers=2,
        trim_right_ratio=1.0,
        codebook_size=1024,
        codebook_dim=None,
        use_conv_shortcut=True,
        **kwargs,
    ):
        """
        Initializes an instance of the EncodecConfig class.

        Args:
            self: The instance of the class.
            target_bandwidths (list[float]): List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].
            sampling_rate (int): The audio sampling rate in Hz. Default is 24000.
            audio_channels (int): The number of audio channels. Default is 1.
            normalize (bool): Flag indicating whether to normalize the audio. Default is False.
            chunk_length_s (float): The length of audio chunks in seconds. Default is None.
            overlap (float): The overlap ratio between audio chunks. Default is None.
            hidden_size (int): The size of the hidden state in the model. Default is 128.
            num_filters (int): The number of filters in the model. Default is 32.
            num_residual_layers (int): The number of residual layers in the model. Default is 1.
            upsampling_ratios (list[int]): List of upsampling ratios. Default is [8, 5, 4, 2].
            norm_type (str): The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.
            kernel_size (int): The size of the convolutional kernel. Default is 7.
            last_kernel_size (int): The size of the last convolutional kernel. Default is 7.
            residual_kernel_size (int): The size of the residual convolutional kernel. Default is 3.
            dilation_growth_rate (int): The growth rate of dilation in the model. Default is 2.
            use_causal_conv (bool): Flag indicating whether to use causal convolution. Default is True.
            pad_mode (str): The padding mode for convolution. Default is 'reflect'.
            compress (int): The compression factor for audio. Default is 2.
            num_lstm_layers (int): The number of LSTM layers in the model. Default is 2.
            trim_right_ratio (float): The ratio of trimming audio from the right. Default is 1.0.
            codebook_size (int): The size of the codebook. Default is 1024.
            codebook_dim (int): The dimension of the codebook. Default is equal to hidden_size if not provided.
            use_conv_shortcut (bool): Flag indicating whether to use convolution shortcut. Default is True.

        Returns:
            None.

        Raises:
            ValueError: If norm_type is not 'weight_norm' or 'time_group_norm'.

        """
        self.target_bandwidths = target_bandwidths
        self.sampling_rate = sampling_rate
        self.audio_channels = audio_channels
        self.normalize = normalize
        self.chunk_length_s = chunk_length_s
        self.overlap = overlap
        self.hidden_size = hidden_size
        self.num_filters = num_filters
        self.num_residual_layers = num_residual_layers
        self.upsampling_ratios = upsampling_ratios
        self.norm_type = norm_type
        self.kernel_size = kernel_size
        self.last_kernel_size = last_kernel_size
        self.residual_kernel_size = residual_kernel_size
        self.dilation_growth_rate = dilation_growth_rate
        self.use_causal_conv = use_causal_conv
        self.pad_mode = pad_mode
        self.compress = compress
        self.num_lstm_layers = num_lstm_layers
        self.trim_right_ratio = trim_right_ratio
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
        self.use_conv_shortcut = use_conv_shortcut

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        super().__init__(**kwargs)

    # This is a property because you might want to change the chunk_length_s on the fly
    @property
    def chunk_length(self) -> Optional[int]:
        r"""
        chunk_length
        """
        if self.chunk_length_s is None:
            return None
        return int(self.chunk_length_s * self.sampling_rate)

    # This is a property because you might want to change the chunk_length_s on the fly
    @property
    def chunk_stride(self) -> Optional[int]:
        r"""
        chunk_stride
        """
        if self.chunk_length_s is None or self.overlap is None:
            return None
        return max(1, int((1.0 - self.overlap) * self.chunk_length))

    @property
    def frame_rate(self) -> int:
        r"""
        frame_rate
        """
        hop_length = np.prod(self.upsampling_ratios)
        return math.ceil(self.sampling_rate / hop_length)

    @property
    def num_quantizers(self) -> int:
        r"""
        num_quantizers
        """
        return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * 10))

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.chunk_length: Optional[int] property

The chunk length in samples, `int(chunk_length_s * sampling_rate)`, or `None` if `chunk_length_s` is not set.

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.chunk_stride: Optional[int] property

The stride between chunks in samples, `max(1, int((1.0 - overlap) * chunk_length))`, or `None` if `chunk_length_s` or `overlap` is not set.

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.frame_rate: int property

The number of latent frames per second produced by the encoder, `ceil(sampling_rate / prod(upsampling_ratios))`.

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.num_quantizers: int property

The number of residual quantizers needed to reach the highest target bandwidth.
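
The constant `10` in the `num_quantizers` source above corresponds to `log2(codebook_size)` for the default `codebook_size=1024`: each quantizer emits one 10-bit code per latent frame, so at 75 frames per second a single quantizer costs 750 bits per second. A small worked check in plain Python (the numbers assume the default configuration):

```python
import math

bits_per_code = math.log2(1024)          # 10 bits per codebook entry
bps_per_quantizer = bits_per_code * 75   # 750 bits/s at frame_rate = 75
print(24_000 / bps_per_quantizer)        # 32.0 -> 32 quantizers for the 24 kbps target
```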

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.__init__(target_bandwidths=[1.5, 3.0, 6.0, 12.0, 24.0], sampling_rate=24000, audio_channels=1, normalize=False, chunk_length_s=None, overlap=None, hidden_size=128, num_filters=32, num_residual_layers=1, upsampling_ratios=[8, 5, 4, 2], norm_type='weight_norm', kernel_size=7, last_kernel_size=7, residual_kernel_size=3, dilation_growth_rate=2, use_causal_conv=True, pad_mode='reflect', compress=2, num_lstm_layers=2, trim_right_ratio=1.0, codebook_size=1024, codebook_dim=None, use_conv_shortcut=True, **kwargs)

Initializes an instance of the EncodecConfig class.

PARAMETER DESCRIPTION
self

The instance of the class.

target_bandwidths

List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].

TYPE: list[float] DEFAULT: [1.5, 3.0, 6.0, 12.0, 24.0]

sampling_rate

The audio sampling rate in Hz. Default is 24000.

TYPE: int DEFAULT: 24000

audio_channels

The number of audio channels. Default is 1.

TYPE: int DEFAULT: 1

normalize

Flag indicating whether to normalize the audio. Default is False.

TYPE: bool DEFAULT: False

chunk_length_s

The length of audio chunks in seconds. Default is None.

TYPE: float DEFAULT: None

overlap

The overlap ratio between audio chunks. Default is None.

TYPE: float DEFAULT: None

hidden_size

The size of the hidden state in the model. Default is 128.

TYPE: int DEFAULT: 128

num_filters

The number of filters in the model. Default is 32.

TYPE: int DEFAULT: 32

num_residual_layers

The number of residual layers in the model. Default is 1.

TYPE: int DEFAULT: 1

upsampling_ratios

List of upsampling ratios. Default is [8, 5, 4, 2].

TYPE: list[int] DEFAULT: [8, 5, 4, 2]

norm_type

The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.

TYPE: str DEFAULT: 'weight_norm'

kernel_size

The size of the convolutional kernel. Default is 7.

TYPE: int DEFAULT: 7

last_kernel_size

The size of the last convolutional kernel. Default is 7.

TYPE: int DEFAULT: 7

residual_kernel_size

The size of the residual convolutional kernel. Default is 3.

TYPE: int DEFAULT: 3

dilation_growth_rate

The growth rate of dilation in the model. Default is 2.

TYPE: int DEFAULT: 2

use_causal_conv

Flag indicating whether to use causal convolution. Default is True.

TYPE: bool DEFAULT: True

pad_mode

The padding mode for convolution. Default is 'reflect'.

TYPE: str DEFAULT: 'reflect'

compress

The compression factor for audio. Default is 2.

TYPE: int DEFAULT: 2

num_lstm_layers

The number of LSTM layers in the model. Default is 2.

TYPE: int DEFAULT: 2

trim_right_ratio

The ratio of trimming audio from the right. Default is 1.0.

TYPE: float DEFAULT: 1.0

codebook_size

The size of the codebook. Default is 1024.

TYPE: int DEFAULT: 1024

codebook_dim

The dimension of the codebook. Default is equal to hidden_size if not provided.

TYPE: int DEFAULT: None

use_conv_shortcut

Flag indicating whether to use convolution shortcut. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If norm_type is not 'weight_norm' or 'time_group_norm'.
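
As a quick illustration of the `norm_type` check, a hedged sketch (assuming the `mindnlp.transformers` import path; the exception text comes from the constructor source above):

```python
from mindnlp.transformers import EncodecConfig

try:
    EncodecConfig(norm_type="layer_norm")  # not one of the two allowed values
except ValueError as err:
    print(err)  # self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got layer_norm
```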


mindnlp.transformers.models.encodec.feature_extraction_encodec

Feature extractor class for EnCodec.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor

Bases: SequenceFeatureExtractor

Constructs an EnCodec feature extractor.

This feature extractor inherits from [~feature_extraction_sequence_utils.SequenceFeatureExtractor] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

Instantiating a feature extractor with the defaults will yield a similar configuration to that of the facebook/encodec_24khz architecture.

PARAMETER DESCRIPTION
feature_size

The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

sampling_rate

The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 24000 DEFAULT: 24000

padding_value

The value that is used to fill the padding values.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

chunk_length_s

If defined the audio is pre-processed into chunks of lengths chunk_length_s and then encoded.

TYPE: `float`, *optional* DEFAULT: None

overlap

Defines the overlap between each chunk. It is used to compute the chunk_stride using the following formula: int((1.0 - self.overlap) * self.chunk_length).

TYPE: `float`, *optional* DEFAULT: None
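
A minimal usage sketch before diving into the source (assuming the `mindnlp.transformers` import path; the sine wave is a synthetic placeholder input):

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor

# One second of mono audio at 24 kHz (a synthetic 440 Hz tone for illustration).
t = np.linspace(0.0, 1.0, 24000, endpoint=False, dtype=np.float32)
audio = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

feature_extractor = EncodecFeatureExtractor(feature_size=1, sampling_rate=24000)
inputs = feature_extractor(raw_audio=audio, sampling_rate=24000, return_tensors="np")

print(inputs["input_values"].shape)  # (1, 1, 24000): (batch, channels, samples)
print(inputs["padding_mask"].shape)  # (1, 24000)
```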

Source code in mindnlp\transformers\models\encodec\feature_extraction_encodec.py
class EncodecFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs an EnCodec feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://hf-mirror.com/facebook/encodec_24khz) architecture.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        chunk_length_s (`float`, *optional*):
            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
    """
    model_input_names = ["input_values", "padding_mask"]

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 24000,
        padding_value: float = 0.0,
        chunk_length_s: float = None,
        overlap: float = None,
        **kwargs,
    ):
        """
        Initialize the EncodecFeatureExtractor class with the given parameters.

        Args:
            self: The instance of the class.
            feature_size (int): The size of the feature. Default is 1.
            sampling_rate (int): The sampling rate in Hz. Default is 24000.
            padding_value (float): The value used for padding. Default is 0.0.
            chunk_length_s (float): The length of each chunk in seconds. Default is None.
            overlap (float): The overlap ratio between consecutive chunks. Default is None.
            **kwargs: Additional keyword arguments.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.chunk_length_s = chunk_length_s
        self.overlap = overlap

    @property
    def chunk_length(self) -> Optional[int]:
        r"""
        # This is a property because you might want to change the chunk_length_s on the fly
        """
        if self.chunk_length_s is None:
            return None
        return int(self.chunk_length_s * self.sampling_rate)

    @property
    def chunk_stride(self) -> Optional[int]:
        r"""
        # This is a property because you might want to change the chunk_length_s on the fly
        """
        if self.chunk_length_s is None or self.overlap is None:
            return None
        return max(1, int((1.0 - self.overlap) * self.chunk_length))

    def __call__(
        self,
        raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
        truncation: Optional[bool] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
                (`feature_size = 2`).
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                lengths).
            truncation (`bool`, *optional*, defaults to `False`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of lists of Python numbers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
        """
        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        if padding and truncation:
            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
        if padding is None:
            # by default let's pad the inputs
            padding = True

        is_batched = bool(
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
        elif not is_batched and not isinstance(raw_audio, np.ndarray):
            raw_audio = np.asarray(raw_audio, dtype=np.float32)
        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
            raw_audio = raw_audio.astype(np.float32)

        # always return batch
        if not is_batched:
            raw_audio = [np.asarray(raw_audio).T]

        # verify inputs are valid
        for _, example in enumerate(raw_audio):
            if example.ndim > 2:
                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
            if self.feature_size == 1 and example.ndim != 1:
                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
            if self.feature_size == 2 and example.shape[-1] != 2:
                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")

        padded_inputs = None
        input_values = BatchFeature({"input_values": raw_audio})
        if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
            if truncation:
                max_length = min(array.shape[0] for array in raw_audio)
                nb_step = int(np.floor(max_length / self.chunk_stride))
                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
            elif padding:
                max_length = max(array.shape[0] for array in raw_audio)
                nb_step = int(np.ceil(max_length / self.chunk_stride))
                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
                padding = "max_length"
            else:
                padded_inputs = input_values

        # normal padding on batch
        if padded_inputs is None:
            padded_inputs = self.pad(
                input_values,
                max_length=max_length,
                truncation=truncation,
                padding=padding,
                return_attention_mask=padding,
            )
            if padding:
                padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

        input_values = []
        for example in padded_inputs.pop("input_values"):
            if self.feature_size == 1:
                example = example[..., None]
            input_values.append(example.T)

        padded_inputs["input_values"] = input_values
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs
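
The validation loop above rejects channel mismatches early. A hedged sketch of what it accepts (assuming the `mindnlp.transformers` import path): mono extractors (`feature_size=1`) want 1-D input, stereo extractors (`feature_size=2`) want `(2, num_samples)`:

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor

stereo = np.zeros((2, 16000), dtype=np.float32)   # (channels, num_samples)

stereo_fe = EncodecFeatureExtractor(feature_size=2, sampling_rate=24000)
out = stereo_fe(raw_audio=stereo, sampling_rate=24000, return_tensors="np")
print(out["input_values"].shape)                  # (1, 2, 16000)

mono_fe = EncodecFeatureExtractor(feature_size=1, sampling_rate=24000)
try:
    mono_fe(raw_audio=stereo, sampling_rate=24000)  # 2-D input with feature_size=1
except ValueError as err:
    print(err)  # Expected mono audio but example has 2 channels
```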

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.chunk_length: Optional[int] property

The chunk length in samples, or `None` if `chunk_length_s` is not set. This is a property because you might want to change `chunk_length_s` on the fly.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.chunk_stride: Optional[int] property

The stride between chunks in samples, or `None` if `chunk_length_s` or `overlap` is not set. This is a property because you might want to change `chunk_length_s` on the fly.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.__call__(raw_audio, padding=None, truncation=False, max_length=None, return_tensors=None, sampling_rate=None)

Main method to featurize and prepare for the model one or several sequence(s).

PARAMETER DESCRIPTION
raw_audio

The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape (num_samples,) for mono audio (feature_size = 1), or (2, num_samples) for stereo audio (feature_size = 2).

TYPE: `np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`

padding

Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:

  • True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single sequence is provided).
  • 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided.
  • False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).

TYPE: `bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True` DEFAULT: None

truncation

Activates truncation to cut input sequences longer than max_length to max_length.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

max_length

Maximum length of the returned list and optionally padding length (see above).

TYPE: `int`, *optional* DEFAULT: None

return_tensors

If set, will return tensors instead of lists of Python numbers. Acceptable values are:

  • 'tf': Return TensorFlow tf.constant objects.
  • 'pt': Return PyTorch torch.Tensor objects.
  • 'np': Return Numpy np.ndarray objects.

TYPE: `str` or [`~utils.TensorType`], *optional* DEFAULT: None

sampling_rate

The sampling rate at which the audio input was sampled. It is strongly recommended to pass sampling_rate at the forward call to prevent silent errors.

TYPE: `int`, *optional* DEFAULT: None
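
A hedged sketch of the chunked-padding path described above: with `chunk_length_s` and `overlap` set and `padding` left at its default, the batch is rounded up to a whole number of chunk strides before padding (the numbers follow the formulas in the source):

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor

# chunk_length = int(1.0 * 24000) = 24000, chunk_stride = max(1, int(0.5 * 24000)) = 12000
fe = EncodecFeatureExtractor(chunk_length_s=1.0, overlap=0.5)
batch = [np.zeros(30000, dtype=np.float32), np.zeros(50000, dtype=np.float32)]

out = fe(raw_audio=batch, sampling_rate=24000, return_tensors="np")
# longest example is 50000 samples; nb_step = ceil(50000 / 12000) = 5,
# so max_length = (5 - 1) * 12000 + 24000 = 72000 padded samples
print(out["input_values"].shape)  # (2, 1, 72000)
print(out["padding_mask"].shape)  # (2, 72000)
```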


mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.__init__(feature_size=1, sampling_rate=24000, padding_value=0.0, chunk_length_s=None, overlap=None, **kwargs)

Initialize the EncodecFeatureExtractor class with the given parameters.

PARAMETER DESCRIPTION
self

The instance of the class.

feature_size

The size of the feature. Default is 1.

TYPE: int DEFAULT: 1

sampling_rate

The sampling rate in Hz. Default is 24000.

TYPE: int DEFAULT: 24000

padding_value

The value used for padding. Default is 0.0.

TYPE: float DEFAULT: 0.0

chunk_length_s

The length of each chunk in seconds. Default is None.

TYPE: float DEFAULT: None

overlap

The overlap ratio between consecutive chunks. Default is None.

TYPE: float DEFAULT: None

**kwargs

Additional keyword arguments.

DEFAULT: {}

RETURNS DESCRIPTION

None.

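
Finally, an end-to-end sketch tying the feature extractor to the model (hedged: this assumes the `mindnlp.transformers` import path, that `return_tensors="ms"` yields MindSpore tensors, the usual `EncodecModel.encode`/`decode` interface, and network access to the `facebook/encodec_24khz` checkpoint):

```python
import numpy as np
from mindnlp.transformers import EncodecFeatureExtractor, EncodecModel

feature_extractor = EncodecFeatureExtractor.from_pretrained("facebook/encodec_24khz")
model = EncodecModel.from_pretrained("facebook/encodec_24khz")

audio = np.zeros(24000, dtype=np.float32)  # one second of silence as a placeholder
inputs = feature_extractor(raw_audio=audio, sampling_rate=24000, return_tensors="ms")

# Encode to discrete codes, then decode back to a waveform.
encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
audio_values = model.decode(
    encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"]
)[0]
print(audio_values.shape)  # (1, 1, num_samples)
```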