whisper

mindnlp.transformers.models.whisper.modeling_whisper

MindSpore Whisper model.

mindnlp.transformers.models.whisper.modeling_whisper.WhisperAttention

Bases: Module

Multi-headed attention from 'Attention Is All You Need' paper

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        layer_idx: Optional[int] = None,
        config: Optional[WhisperConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    # Copied from transformers.models.bart.modeling_bart.BartAttention._shape with BART->whisper
    def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int):
        return ops.transpose(tensor.view(bsz, seq_len, self.num_heads, self.head_dim), 1, 2)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        key_value_states: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[EncoderDecoderCache] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        layer_head_mask: Optional[mindspore.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[mindspore.Tensor] = None,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, _ = hidden_states.shape

        # get query proj
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)

        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                past_key_value.is_updated[self.layer_idx] = True
                past_key_value = past_key_value.cross_attention_cache
            else:
                past_key_value = past_key_value.self_attention_cache

        # use key_value_states if cross attention
        current_states = key_value_states if key_value_states is not None else hidden_states
        if is_cross_attention and past_key_value and is_updated:
            # reuse k,v, cross_attentions
            key_states = past_key_value.key_cache[self.layer_idx]
            value_states = past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self._shape(self.k_proj(current_states), -1, bsz)
            value_states = self._shape(self.v_proj(current_states), -1, bsz)
            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )

        attn_weights = ops.matmul(query_states, ops.transpose(key_states, 2, 3))

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.shape != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.shape}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = ops.matmul(attn_probs, value_states)

        if attn_output.shape != (bsz, self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        attn_output = ops.transpose(attn_output, 1, 2)
        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned across GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights, past_key_value
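
As a quick orientation, the module can be exercised on its own as plain self-attention. The following is a minimal sketch; the batch size, sequence length, embedding width and head count are illustrative assumptions, not values taken from any released checkpoint:

import numpy as np
import mindspore

from mindnlp.transformers.models.whisper.modeling_whisper import WhisperAttention

# Illustrative sizes (assumptions): batch of 2, 10 time steps, 64-dim embeddings, 4 heads.
attn = WhisperAttention(embed_dim=64, num_heads=4, dropout=0.0)

hidden_states = mindspore.Tensor(np.random.randn(2, 10, 64).astype(np.float32))

# Calling the module runs `forward`; with no `key_value_states` this is plain self-attention.
attn_output, attn_weights, _ = attn(hidden_states)

print(attn_output.shape)   # expected: (2, 10, 64)
print(attn_weights.shape)  # expected: (2, 4, 10, 10)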

mindnlp.transformers.models.whisper.modeling_whisper.WhisperAttention.forward(hidden_states, key_value_states=None, past_key_value=None, attention_mask=None, layer_head_mask=None, output_attentions=False, cache_position=None)

Input shape: Batch x Time x Channel

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    hidden_states: mindspore.Tensor,
    key_value_states: Optional[mindspore.Tensor] = None,
    past_key_value: Optional[EncoderDecoderCache] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    layer_head_mask: Optional[mindspore.Tensor] = None,
    output_attentions: bool = False,
    cache_position: Optional[mindspore.Tensor] = None,
) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
    """Input shape: Batch x Time x Channel"""

    # if key_value_states are provided this layer is used as a cross-attention layer
    # for the decoder
    is_cross_attention = key_value_states is not None
    bsz, tgt_len, _ = hidden_states.shape

    # get query proj
    query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)

    if past_key_value is not None:
        is_updated = past_key_value.is_updated.get(self.layer_idx)
        if is_cross_attention:
            # after the first generated id, we can subsequently re-use all key/value_states from cache
            past_key_value.is_updated[self.layer_idx] = True
            past_key_value = past_key_value.cross_attention_cache
        else:
            past_key_value = past_key_value.self_attention_cache

    # use key_value_states if cross attention
    current_states = key_value_states if key_value_states is not None else hidden_states
    if is_cross_attention and past_key_value and is_updated:
        # reuse k,v, cross_attentions
        key_states = past_key_value.key_cache[self.layer_idx]
        value_states = past_key_value.value_cache[self.layer_idx]
    else:
        key_states = self._shape(self.k_proj(current_states), -1, bsz)
        value_states = self._shape(self.v_proj(current_states), -1, bsz)
        if past_key_value is not None:
            # save all key/value_states to cache to be re-used for fast auto-regressive generation
            cache_position = cache_position if not is_cross_attention else None
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
            )

    attn_weights = ops.matmul(query_states, ops.transpose(key_states, 2, 3))

    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if layer_head_mask is not None:
        if layer_head_mask.shape != (self.num_heads,):
            raise ValueError(
                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                f" {layer_head_mask.shape}"
            )
        attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights

    attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
    attn_output = ops.matmul(attn_probs, value_states)

    if attn_output.shape != (bsz, self.num_heads, tgt_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
            f" {attn_output.shape}"
        )

    attn_output = ops.transpose(attn_output, 1, 2)
    # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
    # partitioned across GPUs when using tensor-parallelism.
    attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

    attn_output = self.out_proj(attn_output)

    return attn_output, attn_weights, past_key_value
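
The forward pass switches to cross-attention whenever `key_value_states` is given: queries come from `hidden_states`, keys and values from `key_value_states`. A short sketch, again with illustrative shapes only:

import numpy as np
import mindspore

from mindnlp.transformers.models.whisper.modeling_whisper import WhisperAttention

# A decoder-style attention block; `layer_idx` is only needed once a cache is involved.
cross_attn = WhisperAttention(embed_dim=64, num_heads=4, is_decoder=True, layer_idx=0)

decoder_states = mindspore.Tensor(np.random.randn(2, 5, 64).astype(np.float32))   # queries
encoder_states = mindspore.Tensor(np.random.randn(2, 12, 64).astype(np.float32))  # keys/values

# Passing `key_value_states` turns the layer into cross-attention.
attn_output, attn_weights, _ = cross_attn(decoder_states, key_value_states=encoder_states)

print(attn_output.shape)   # expected: (2, 5, 64)
print(attn_weights.shape)  # expected: (2, 4, 5, 12)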

mindnlp.transformers.models.whisper.modeling_whisper.WhisperDecoder

Bases: WhisperPreTrainedModel

Transformer decoder consisting of config.decoder_layers layers. Each layer is a [WhisperDecoderLayer]

PARAMETER DESCRIPTION
config

WhisperConfig

TYPE: WhisperConfig

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperDecoder(WhisperPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`WhisperDecoderLayer`]

    Args:
        config: WhisperConfig
    """

    main_input_name = "input_ids"

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_target_positions
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
        self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model)

        self.layers = nn.ModuleList(
            [WhisperDecoderLayer(config, layer_idx) for layer_idx in range(config.decoder_layers)]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self._use_sdpa = config._attn_implementation == "sdpa"

        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        position_ids=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        r"""
        Args:
            input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`mindspore.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
                on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`EncoderDecoderCache` or `tuple(tuple(mindspore.Tensor))`, *optional*):
                Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are
                four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and
                in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or
                when `config.use_cache=True`

                Two formats are allowed:
                - An [`~cache_utils.EncoderDecoderCache`] instance;
                - Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
                `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`mindspore.Tensor` of
                shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
                `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
                control over how to convert `input_ids` indices into associated vectors than the model's internal
                embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            cache_position (`mindspore.Tensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        return_legacy_cache = False
        return_self_attention_cache = False
        if use_cache or past_key_values is not None:
            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
                return_self_attention_cache = True
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            elif not isinstance(past_key_values, EncoderDecoderCache):
                return_legacy_cache = True
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. "
                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = 0
        if cache_position is not None:
            past_key_values_length = cache_position[0]
        elif past_key_values is not None:
            past_key_values_length = past_key_values.get_seq_length()

        if cache_position is None:
            cache_position = ops.arange(
                past_key_values_length, past_key_values_length + input_shape[1]
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # embed positions
        if input_ids is not None:
            positions = self.embed_positions(
                input_ids, past_key_values_length=past_key_values_length, position_ids=position_ids
            )
        else:
            positions = self.embed_positions(
                inputs_embeds, past_key_values_length=past_key_values_length, position_ids=position_ids
            )

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values.self_attention_cache if past_key_values is not None else None,
            output_attentions,
        )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
                )
                use_cache = False
        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                assert attn_mask.shape[0] == (len(self.layers)), (
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.shape[0]}."
                )
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = ops.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    encoder_hidden_states,
                    None,  # encoder attention mask
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,  # past_key_value
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_values if use_cache else None,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.layer_norm(hidden_states)
        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = past_key_values if use_cache else None
        if return_self_attention_cache:
            next_cache = past_key_values.self_attention_cache
        if return_legacy_cache:
            next_cache = past_key_values.to_legacy_cache()
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: mindspore.Tensor,
        input_tensor: mindspore.Tensor,
        cache_position: mindspore.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = float(ops.finfo(dtype).min)
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, mindspore.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            min_dtype=min_dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        return causal_mask
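
A minimal sketch of constructing and running the decoder from a deliberately tiny configuration. The sizes, the `pad_token_id` override and the top-level `WhisperConfig` import path are assumptions made for illustration; they do not correspond to any released checkpoint:

import numpy as np
import mindspore

from mindnlp.transformers import WhisperConfig  # import path assumed
from mindnlp.transformers.models.whisper.modeling_whisper import WhisperDecoder

# Tiny illustrative config; pad_token_id is set below vocab_size so the embedding's padding_idx is valid.
config = WhisperConfig(
    vocab_size=100,
    d_model=64,
    decoder_layers=2,
    decoder_attention_heads=4,
    decoder_ffn_dim=128,
    max_target_positions=32,
    pad_token_id=0,
)
decoder = WhisperDecoder(config)

input_ids = mindspore.Tensor(np.array([[1, 2, 3, 4]]), mindspore.int64)                  # (batch, seq)
encoder_hidden_states = mindspore.Tensor(np.random.randn(1, 8, 64).astype(np.float32))   # (batch, enc_seq, d_model)

outputs = decoder(
    input_ids=input_ids,
    encoder_hidden_states=encoder_hidden_states,
    use_cache=False,
    return_dict=True,
)
print(outputs.last_hidden_state.shape)  # expected: (1, 4, 64)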

mindnlp.transformers.models.whisper.modeling_whisper.WhisperDecoder.forward(input_ids=None, attention_mask=None, encoder_hidden_states=None, head_mask=None, cross_attn_head_mask=None, past_key_values=None, inputs_embeds=None, position_ids=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, cache_position=None)

PARAMETER DESCRIPTION
input_ids

Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.

Indices can be obtained using [WhisperTokenizer]. See [PreTrainedTokenizer.encode] and [PreTrainedTokenizer.__call__] for details.

What are input IDs?

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)` DEFAULT: None

attention_mask

Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]:

  • 1 for tokens that are not masked,
  • 0 for tokens that are masked.

What are attention masks?

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

encoder_hidden_states

Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.

TYPE: `mindspore.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional* DEFAULT: None

head_mask

Mask to nullify selected heads of the attention modules. Mask values selected in [0, 1]:

  • 1 indicates the head is not masked,
  • 0 indicates the head is masked.

TYPE: `mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional* DEFAULT: None

cross_attn_head_mask

Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention on hidden heads. Mask values selected in [0, 1]:

  • 1 indicates the head is not masked,
  • 0 indicates the head is masked.

TYPE: `mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional* DEFAULT: None

past_key_values

Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are four sets of pre-computed hidden-states: key and value states in the self-attention blocks (2) and in the cross-attention blocks (2). The past_key_values are returned when use_cache=True is passed or when config.use_cache=True.

Two formats are allowed:

  • An [~cache_utils.EncoderDecoderCache] instance;
  • Tuple of tuple(mindspore.Tensor) of length config.n_layers, with each tuple having 2 tensors of shape (batch_size, num_heads, sequence_length, embed_size_per_head) and 2 additional tensors of shape (batch_size, num_heads, encoder_sequence_length, embed_size_per_head).

If past_key_values are used, the user can optionally input only the last decoder_input_ids (those that don't have their past key value states given to this model) of shape (batch_size, 1) instead of all decoder_input_ids of shape (batch_size, sequence_length).

TYPE: `EncoderDecoderCache` or `tuple(tuple(mindspore.Tensor))`, *optional* DEFAULT: None

output_attentions

Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: None

output_hidden_states

Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: None

return_dict

Whether or not to return a [~utils.ModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional* DEFAULT: None

cache_position

Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache in the correct position and to infer the complete sequence length.

TYPE: `mindspore.Tensor` of shape `(sequence_length)`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    encoder_hidden_states=None,
    head_mask=None,
    cross_attn_head_mask=None,
    past_key_values=None,
    inputs_embeds=None,
    position_ids=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    cache_position=None,
):
    r"""
    Args:
        input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
            provide it.

            Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        encoder_hidden_states (`mindspore.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
            on hidden heads. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        past_key_values (`EncoderDecoderCache` or `tuple(tuple(mindspore.Tensor))`, *optional*):
            Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are
            four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and
            in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or
            when `config.use_cache=True`

            Two formats are allowed:
            - An [`~cache_utils.EncoderDecoderCache`] instance;
            - Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
            that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
            all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`mindspore.Tensor` of
            shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
            `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
            control over how to convert `input_ids` indices into associated vectors than the model's internal
            embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
            for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`mindspore.Tensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
            cache in the correct position and to infer the complete sequence length.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.shape
        input_ids = input_ids.view(-1, input_shape[-1])
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.shape[:-1]
    else:
        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    return_legacy_cache = False
    return_self_attention_cache = False
    if use_cache or past_key_values is not None:
        if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
            return_self_attention_cache = True
            past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
        elif not isinstance(past_key_values, EncoderDecoderCache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

    past_key_values_length = 0
    if cache_position is not None:
        past_key_values_length = cache_position[0]
    elif past_key_values is not None:
        past_key_values_length = past_key_values.get_seq_length()

    if cache_position is None:
        cache_position = ops.arange(
            past_key_values_length, past_key_values_length + input_shape[1]
        )

    if position_ids is None:
        position_ids = cache_position.unsqueeze(0)

    # embed positions
    if input_ids is not None:
        positions = self.embed_positions(
            input_ids, past_key_values_length=past_key_values_length, position_ids=position_ids
        )
    else:
        positions = self.embed_positions(
            inputs_embeds, past_key_values_length=past_key_values_length, position_ids=position_ids
        )

    hidden_states = inputs_embeds + positions
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

    causal_mask = self._update_causal_mask(
        attention_mask,
        inputs_embeds,
        cache_position,
        past_key_values.self_attention_cache if past_key_values is not None else None,
        output_attentions,
    )

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
            )
            use_cache = False
    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

    # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
    for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
        if attn_mask is not None:
            assert attn_mask.shape[0] == (len(self.layers)), (
                f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.shape[0]}."
            )
    for idx, decoder_layer in enumerate(self.layers):
        # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        if self.training:
            dropout_probability = ops.rand([])
            if dropout_probability < self.layerdrop:
                continue

        if self.gradient_checkpointing and self.training:
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                hidden_states,
                causal_mask,
                encoder_hidden_states,
                None,  # encoder attention mask
                head_mask[idx] if head_mask is not None else None,
                cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                None,  # past_key_value
                output_attentions,
                use_cache,
                cache_position,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                encoder_hidden_states=encoder_hidden_states,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                ),
                past_key_value=past_key_values if use_cache else None,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
        hidden_states = layer_outputs[0]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

            if encoder_hidden_states is not None:
                all_cross_attentions += (layer_outputs[2],)

    hidden_states = self.layer_norm(hidden_states)
    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = past_key_values if use_cache else None
    if return_self_attention_cache:
        next_cache = past_key_values.self_attention_cache
    if return_legacy_cache:
        next_cache = past_key_values.to_legacy_cache()
    if not return_dict:
        return tuple(
            v
            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
            if v is not None
        )
    return BaseModelOutputWithPastAndCrossAttentions(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
        cross_attentions=all_cross_attentions,
    )
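
Following the deprecation note in the code above, the cache can be passed as an explicit `EncoderDecoderCache` and reused across steps for incremental decoding. The sketch below assumes the `cache_utils` import path and reuses the tiny illustrative config from earlier; it is a hedged sketch, not a verified recipe from the library:

import numpy as np
import mindspore

from mindnlp.transformers import WhisperConfig                                    # import path assumed
from mindnlp.transformers.cache_utils import DynamicCache, EncoderDecoderCache    # import path assumed
from mindnlp.transformers.models.whisper.modeling_whisper import WhisperDecoder

config = WhisperConfig(vocab_size=100, d_model=64, decoder_layers=2,
                       decoder_attention_heads=4, decoder_ffn_dim=128,
                       max_target_positions=32, pad_token_id=0)
decoder = WhisperDecoder(config)

encoder_hidden_states = mindspore.Tensor(np.random.randn(1, 8, 64).astype(np.float32))
cache = EncoderDecoderCache(DynamicCache(), DynamicCache())  # (self-attention cache, cross-attention cache)

# Step 1: feed the full prefix; the decoder fills the self- and cross-attention caches.
prefix = mindspore.Tensor(np.array([[1, 2, 3]]), mindspore.int64)
out = decoder(input_ids=prefix, encoder_hidden_states=encoder_hidden_states,
              past_key_values=cache, use_cache=True, return_dict=True)

# Step 2: feed only the newest token; the cached key/value states cover the prefix.
next_token = mindspore.Tensor(np.array([[4]]), mindspore.int64)
out = decoder(input_ids=next_token, encoder_hidden_states=encoder_hidden_states,
              past_key_values=out.past_key_values, use_cache=True, return_dict=True)
print(out.last_hidden_state.shape)  # expected: (1, 1, 64)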

mindnlp.transformers.models.whisper.modeling_whisper.WhisperDecoderLayer

Bases: Module

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperDecoderLayer(nn.Module):
    def __init__(self, config: WhisperConfig, layer_idx: int = None):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            layer_idx=layer_idx,
            config=config,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
            config=config,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        layer_head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_layer_head_mask: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[EncoderDecoderCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[mindspore.Tensor] = None,
    ) -> mindspore.Tensor:
        """
        Args:
            hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`mindspore.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`mindspore.Tensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`mindspore.Tensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`mindspore.Tensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`mindspore.Tensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(mindspore.Tensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

            # add cross-attn to positions 1 of present_key_value tuple
            present_key_value = (present_key_value, cross_attn_present_key_value)

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
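
A short sketch of running a single decoder layer in isolation, with the same tiny illustrative config as above (all sizes and the `WhisperConfig` import path are assumptions):

import numpy as np
import mindspore

from mindnlp.transformers import WhisperConfig  # import path assumed
from mindnlp.transformers.models.whisper.modeling_whisper import WhisperDecoderLayer

config = WhisperConfig(vocab_size=100, d_model=64, decoder_layers=2,
                       decoder_attention_heads=4, decoder_ffn_dim=128, pad_token_id=0)
layer = WhisperDecoderLayer(config, layer_idx=0)

hidden_states = mindspore.Tensor(np.random.randn(1, 5, 64).astype(np.float32))
encoder_hidden_states = mindspore.Tensor(np.random.randn(1, 8, 64).astype(np.float32))

# With `use_cache=False` the layer returns just `(hidden_states,)`;
# `output_attentions=True` would also return self- and cross-attention weights.
outputs = layer(hidden_states, encoder_hidden_states=encoder_hidden_states, use_cache=False)
print(outputs[0].shape)  # expected: (1, 5, 64)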

mindnlp.transformers.models.whisper.modeling_whisper.WhisperDecoderLayer.forward(hidden_states, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, output_attentions=False, use_cache=True, cache_position=None)

PARAMETER DESCRIPTION
hidden_states

input to the layer of shape (batch, seq_len, embed_dim)

TYPE: `mindspore.Tensor`

attention_mask

attention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.

TYPE: `mindspore.Tensor` DEFAULT: None

encoder_hidden_states

cross attention input to the layer of shape (batch, seq_len, embed_dim)

TYPE: `mindspore.Tensor` DEFAULT: None

encoder_attention_mask

encoder attention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.

TYPE: `mindspore.Tensor` DEFAULT: None

layer_head_mask

mask for attention heads in a given layer of size (encoder_attention_heads,).

TYPE: `mindspore.Tensor` DEFAULT: None

cross_attn_layer_head_mask

mask for cross-attention heads in a given layer of size (decoder_attention_heads,).

TYPE: `mindspore.Tensor` DEFAULT: None

past_key_value

cached past key and value projection states

TYPE: `Tuple(mindspore.Tensor)` DEFAULT: None

output_attentions

Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: False

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    hidden_states: mindspore.Tensor,
    attention_mask: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    layer_head_mask: Optional[mindspore.Tensor] = None,
    cross_attn_layer_head_mask: Optional[mindspore.Tensor] = None,
    past_key_value: Optional[EncoderDecoderCache] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = True,
    cache_position: Optional[mindspore.Tensor] = None,
) -> mindspore.Tensor:
    """
    Args:
        hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`mindspore.Tensor`): attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        encoder_hidden_states (`mindspore.Tensor`):
            cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
        encoder_attention_mask (`mindspore.Tensor`): encoder attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        layer_head_mask (`mindspore.Tensor`): mask for attention heads in a given layer of size
            `(encoder_attention_heads,)`.
        cross_attn_layer_head_mask (`mindspore.Tensor`): mask for cross-attention heads in a given layer of
            size `(decoder_attention_heads,)`.
        past_key_value (`Tuple(mindspore.Tensor)`): cached past key and value projection states
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
    """
    residual = hidden_states
    hidden_states = self.self_attn_layer_norm(hidden_states)

    # Self Attention
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        past_key_value=past_key_value,
        attention_mask=attention_mask,
        layer_head_mask=layer_head_mask,
        output_attentions=output_attentions,
        cache_position=cache_position,
    )
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
    hidden_states = residual + hidden_states

    # Cross-Attention Block
    cross_attn_weights = None
    if encoder_hidden_states is not None:
        residual = hidden_states
        hidden_states = self.encoder_attn_layer_norm(hidden_states)
        hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
            hidden_states=hidden_states,
            key_value_states=encoder_hidden_states,
            attention_mask=encoder_attention_mask,
            layer_head_mask=cross_attn_layer_head_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # add cross-attn to positions 1 of present_key_value tuple
        present_key_value = (present_key_value, cross_attn_present_key_value)

    # Fully Connected
    residual = hidden_states
    hidden_states = self.final_layer_norm(hidden_states)
    hidden_states = self.activation_fn(self.fc1(hidden_states))
    hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
    hidden_states = self.fc2(hidden_states)
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights, cross_attn_weights)

    if use_cache:
        outputs += (present_key_value,)

    return outputs
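
To make the data flow above concrete, here is a minimal, illustrative sketch of driving one decoder layer with random inputs. The constructor call (`WhisperDecoderLayer(config, layer_idx=0)`) and the small config values are assumptions, not taken from this page; adapt them to the actual mindnlp API.

```python
# Hedged sketch: exercise a single decoder layer (self-attention -> cross-attention -> FFN).
# The constructor signature below is an assumption mirroring the Transformers layout.
from mindspore import ops
from mindnlp.transformers import WhisperConfig
from mindnlp.transformers.models.whisper.modeling_whisper import WhisperDecoderLayer

config = WhisperConfig(d_model=384, decoder_attention_heads=6, decoder_ffn_dim=1536)
layer = WhisperDecoderLayer(config, layer_idx=0)  # assumed signature

hidden_states = ops.randn(2, 10, config.d_model)            # (batch, tgt_len, embed_dim)
encoder_hidden_states = ops.randn(2, 1500, config.d_model)  # (batch, src_len, embed_dim)

outputs = layer(
    hidden_states,
    encoder_hidden_states=encoder_hidden_states,
    output_attentions=True,
    use_cache=False,
)
# outputs[0] is the transformed hidden states; with output_attentions=True the
# self- and cross-attention weights follow at positions 1 and 2.
print(outputs[0].shape)  # (2, 10, 384)
```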

mindnlp.transformers.models.whisper.modeling_whisper.WhisperDecoderWrapper

Bases: WhisperPreTrainedModel

This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is used in combination with the [EncoderDecoderModel] framework.

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperDecoderWrapper(WhisperPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    """

    def __init__(self, config):
        super().__init__(config)
        config.is_encoder_decoder = False
        self.decoder = WhisperDecoder(config)

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)

mindnlp.transformers.models.whisper.modeling_whisper.WhisperEncoder

Bases: WhisperPreTrainedModel

Transformer encoder consisting of config.encoder_layers self attention layers. Each layer is a [WhisperEncoderLayer].

PARAMETER DESCRIPTION
config

WhisperConfig

TYPE: WhisperConfig

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperEncoder(WhisperPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`WhisperEncoderLayer`].

    Args:
        config: WhisperConfig
    """

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Args:
            input_features (`mindspore.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `mindspore.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`mindspore.Tensor`, *optional*):
                Whisper does not support masking of the `input_features`; this argument is preserved for compatibility,
                but it is not used. By default, silence in the input log-mel spectrogram is ignored.
            head_mask (`mindspore.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """

        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        hidden_states = inputs_embeds + embed_pos
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.shape[0] == (
                len(self.layers)
            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.shape[0]}."

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = ops.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if to_drop:
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        None,
                        (head_mask[idx] if head_mask is not None else None),
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        None,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
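
Because `conv2` uses stride 2 (and `conv1` stride 1), the encoder only accepts log-mel inputs whose length equals `max_source_positions * 2` frames (3000 with the default configuration) and downsamples them to 1500 positions. A hedged sketch with a randomly initialized default config; the import path and shapes are illustrative only:

```python
# Illustrative only: the fixed input-length check enforced in WhisperEncoder.forward.
from mindspore import ops
from mindnlp.transformers import WhisperConfig
from mindnlp.transformers.models.whisper.modeling_whisper import WhisperEncoder

config = WhisperConfig()          # defaults: num_mel_bins=80, max_source_positions=1500
encoder = WhisperEncoder(config)

expected = config.max_source_positions * encoder.conv1.stride[0] * encoder.conv2.stride[0]
print(expected)                   # 3000 mel frames (1500 * 1 * 2)

features = ops.randn(1, config.num_mel_bins, expected)   # (batch, feature_size, sequence_length)
out = encoder(features, return_dict=True)
print(out.last_hidden_state.shape)                        # (1, 1500, config.d_model)
```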

mindnlp.transformers.models.whisper.modeling_whisper.WhisperEncoder.forward(input_features, attention_mask=None, head_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
input_features

Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a .flac or .wav audio file into an array of type List[float] or a numpy.ndarray, e.g. via the soundfile library (pip install soundfile). To prepare the array into input_features, the [AutoFeatureExtractor] should be used for extracting the mel features, padding and conversion into a tensor of type mindspore.Tensor. See [~WhisperFeatureExtractor.__call__]

TYPE: `mindspore.Tensor` of shape `(batch_size, feature_size, sequence_length)`

attention_mask

Whisper does not support masking of the input_features; this argument is preserved for compatibility, but it is not used. By default, silence in the input log-mel spectrogram is ignored.

TYPE: `mindspore.Tensor`, *optional* DEFAULT: None

head_mask

Mask to nullify selected heads of the attention modules. Mask values selected in [0, 1]:

  • 1 indicates the head is not masked,
  • 0 indicates the head is masked.

TYPE: `mindspore.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional* DEFAULT: None

output_attentions

Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: None

output_hidden_states

Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: None

return_dict

Whether or not to return a [~utils.ModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    input_features,
    attention_mask=None,
    head_mask=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    Args:
        input_features (`mindspore.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `mindspore.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
        attention_mask (`mindspore.Tensor`, *optional*):
            Whisper does not support masking of the `input_features`; this argument is preserved for compatibility,
            but it is not used. By default, silence in the input log-mel spectrogram is ignored.
        head_mask (`mindspore.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
            for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
    """

    expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
    if input_features.shape[-1] != expected_seq_length:
        raise ValueError(
            f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
        )

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    inputs_embeds = nn.functional.gelu(self.conv1(input_features))
    inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

    inputs_embeds = inputs_embeds.permute(0, 2, 1)
    embed_pos = self.embed_positions.weight

    hidden_states = inputs_embeds + embed_pos
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

    encoder_states = () if output_hidden_states else None
    all_attentions = () if output_attentions else None

    # check if head_mask has a correct number of layers specified if desired
    if head_mask is not None:
        assert head_mask.shape[0] == (
            len(self.layers)
        ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.shape[0]}."

    for idx, encoder_layer in enumerate(self.layers):
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)
        # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
        to_drop = False
        if self.training:
            dropout_probability = ops.rand([])
            if dropout_probability < self.layerdrop:  # skip the layer
                to_drop = True

        if to_drop:
            layer_outputs = (None, None)
        else:
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    None,
                    (head_mask[idx] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    None,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

        if output_attentions:
            all_attentions = all_attentions + (layer_outputs[1],)

    hidden_states = self.layer_norm(hidden_states)
    if output_hidden_states:
        encoder_states = encoder_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
    return BaseModelOutput(
        last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
    )

mindnlp.transformers.models.whisper.modeling_whisper.WhisperEncoderLayer

Bases: Module

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperEncoderLayer(nn.Module):
    def __init__(self, config: WhisperConfig):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: mindspore.Tensor,
        layer_head_mask: mindspore.Tensor,
        output_attentions: bool = False,
    ) -> mindspore.Tensor:
        """
        Args:
            hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`mindspore.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`mindspore.Tensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == mindspore.float16 and (
            ops.isinf(hidden_states).any() or ops.isnan(hidden_states).any()
        ):
            clamp_value = ops.finfo(hidden_states.dtype).max - 1000
            hidden_states = ops.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

mindnlp.transformers.models.whisper.modeling_whisper.WhisperEncoderLayer.forward(hidden_states, attention_mask, layer_head_mask, output_attentions=False)

PARAMETER DESCRIPTION
hidden_states

input to the layer of shape (batch, seq_len, embed_dim)

TYPE: `mindspore.Tensor`

attention_mask

attention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.

TYPE: `mindspore.Tensor`

layer_head_mask

mask for attention heads in a given layer of size (encoder_attention_heads,).

TYPE: `mindspore.Tensor`

output_attentions

Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: False

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    hidden_states: mindspore.Tensor,
    attention_mask: mindspore.Tensor,
    layer_head_mask: mindspore.Tensor,
    output_attentions: bool = False,
) -> mindspore.Tensor:
    """
    Args:
        hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`mindspore.Tensor`): attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        layer_head_mask (`mindspore.Tensor`): mask for attention heads in a given layer of size
            `(encoder_attention_heads,)`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
    """
    residual = hidden_states
    hidden_states = self.self_attn_layer_norm(hidden_states)
    hidden_states, attn_weights, _ = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        layer_head_mask=layer_head_mask,
        output_attentions=output_attentions,
    )
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
    hidden_states = residual + hidden_states

    residual = hidden_states
    hidden_states = self.final_layer_norm(hidden_states)
    hidden_states = self.activation_fn(self.fc1(hidden_states))
    hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
    hidden_states = self.fc2(hidden_states)
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
    hidden_states = residual + hidden_states

    if hidden_states.dtype == mindspore.float16 and (
        ops.isinf(hidden_states).any() or ops.isnan(hidden_states).any()
    ):
        clamp_value = ops.finfo(hidden_states.dtype).max - 1000
        hidden_states = ops.clamp(hidden_states, min=-clamp_value, max=clamp_value)

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (attn_weights,)

    return outputs
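
The float16 guard above clamps activations just below the largest representable value so that an overflowing layer cannot propagate `inf`/`nan`. A small illustrative sketch of the numbers involved (numpy is used here only to show the values; the model code reads the bound from the tensor dtype via `ops.finfo`):

```python
# Illustrative: the clamp bound used for float16 activations.
import numpy as np

fp16_max = float(np.finfo(np.float16).max)   # 65504.0
clamp_value = fp16_max - 1000                # 64504.0, the bound applied by ops.clamp above
print(fp16_max, clamp_value)
```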

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForAudioClassification

Bases: WhisperPreTrainedModel

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperForAudioClassification(WhisperPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.encoder = WhisperEncoder(config)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers)
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_encoder(self):
        """
        Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will
        not be updated during training. Only the projection layers and classification head will be updated.
        """
        self.encoder._freeze_parameters()

    def get_input_embeddings(self) -> nn.Module:
        return self.encoder.get_input_embeddings()

    def set_input_embeddings(self, value: nn.Module):
        self.encoder.set_input_embeddings(value)

    def forward(
        self,
        input_features: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, WhisperForAudioClassification
        >>> from datasets import load_dataset

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
        >>> model = WhisperForAudioClassification.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")

        >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
        >>> sample = next(iter(ds))

        >>> inputs = feature_extractor(
        ...     sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="ms"
        ... )
        >>> input_features = inputs.input_features

        >>> with no_grad():
        ...     logits = model(input_features).logits

        >>> predicted_class_ids = ops.argmax(logits).item()
        >>> predicted_label = model.config.id2label[predicted_class_ids]
        >>> predicted_label
        'Afrikaans'
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        if self.config.use_weighted_layer_sum:
            output_hidden_states = True
        elif output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_features,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        if self.config.use_weighted_layer_sum:
            hidden_states = encoder_outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = ops.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = ops.sum((hidden_states * norm_weights.view(-1, 1, 1)), dim=1)
        else:
            hidden_states = encoder_outputs[0]

        hidden_states = self.projector(hidden_states)
        pooled_output = ops.mean(hidden_states, dim=1)

        logits = self.classifier(pooled_output)

        loss = None

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + encoder_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForAudioClassification.forward(input_features=None, head_mask=None, encoder_outputs=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss); if config.num_labels > 1 a classification loss is computed (Cross-Entropy).

Returns:

Example:

>>> import torch
>>> from transformers import AutoFeatureExtractor, WhisperForAudioClassification
>>> from datasets import load_dataset

>>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
>>> model = WhisperForAudioClassification.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")

>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
>>> sample = next(iter(ds))

>>> inputs = feature_extractor(
...     sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="ms"
... )
>>> input_features = inputs.input_features

>>> with no_grad():
...     logits = model(input_features).logits

>>> predicted_class_ids = ops.argmax(logits).item()
>>> predicted_label = model.config.id2label[predicted_class_ids]
>>> predicted_label
'Afrikaans'
Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    input_features: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], SequenceClassifierOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:

    Example:

    ```python
    >>> import torch
    >>> from transformers import AutoFeatureExtractor, WhisperForAudioClassification
    >>> from datasets import load_dataset

    >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
    >>> model = WhisperForAudioClassification.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")

    >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
    >>> sample = next(iter(ds))

    >>> inputs = feature_extractor(
    ...     sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="ms"
    ... )
    >>> input_features = inputs.input_features

    >>> with no_grad():
    ...     logits = model(input_features).logits

    >>> predicted_class_ids = ops.argmax(logits).item()
    >>> predicted_label = model.config.id2label[predicted_class_ids]
    >>> predicted_label
    'Afrikaans'
    ```"""

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    if self.config.use_weighted_layer_sum:
        output_hidden_states = True
    elif output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if encoder_outputs is None:
        encoder_outputs = self.encoder(
            input_features,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    if self.config.use_weighted_layer_sum:
        hidden_states = encoder_outputs[_HIDDEN_STATES_START_POSITION]
        hidden_states = ops.stack(hidden_states, dim=1)
        norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
        hidden_states = ops.sum((hidden_states * norm_weights.view(-1, 1, 1)), dim=1)
    else:
        hidden_states = encoder_outputs[0]

    hidden_states = self.projector(hidden_states)
    pooled_output = ops.mean(hidden_states, dim=1)

    logits = self.classifier(pooled_output)

    loss = None

    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

    if not return_dict:
        output = (logits,) + encoder_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
    )
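
When `config.use_weighted_layer_sum` is enabled, the hidden states from every encoder layer (plus the input embeddings) are stacked and mixed with softmax-normalised learned weights before mean pooling. A shape-only sketch of that reduction using plain MindSpore ops and random data; the sizes are illustrative:

```python
# Illustrative sketch of the weighted layer sum + mean pooling used above.
from mindspore import ops

num_layers, batch, seq_len, hidden = 5, 2, 1500, 384          # 4 layers + input embeddings
all_hidden = ops.stack(
    [ops.randn(batch, seq_len, hidden) for _ in range(num_layers)], axis=1
)                                                              # (batch, num_layers, seq_len, hidden)
layer_weights = ops.ones(num_layers) / num_layers              # learned nn.Parameter in the model

norm_weights = ops.softmax(layer_weights, axis=-1)             # softmax over layers
weighted = (all_hidden * norm_weights.view(-1, 1, 1)).sum(axis=1)   # (batch, seq_len, hidden)
pooled = weighted.mean(axis=1)                                 # (batch, hidden) -> projector/classifier
print(pooled.shape)
```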

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForAudioClassification.freeze_encoder()

Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will not be updated during training. Only the projection layers and classification head will be updated.

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
1736
1737
1738
1739
1740
1741
def freeze_encoder(self):
    """
    Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will
    not be updated during training. Only the projection layers and classification head will be updated.
    """
    self.encoder._freeze_parameters()
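
A hedged usage sketch: freeze the encoder, then hand only the remaining trainable parameters (projector and classification head) to whatever optimizer your training loop uses. The checkpoint name repeats the one from the example above; the import path and the rest of the setup are assumptions about a typical fine-tuning workflow.

```python
# Hedged sketch: encoder parameters stop receiving gradients after freeze_encoder().
from mindnlp.transformers import WhisperForAudioClassification  # import path assumed

model = WhisperForAudioClassification.from_pretrained(
    "sanchit-gandhi/whisper-medium-fleurs-lang-id"
)
model.freeze_encoder()

trainable = [p for p in model.parameters() if p.requires_grad]
print(len(trainable))   # only the projector and classifier weights remain trainable
# pass `trainable` to your optimizer of choice
```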

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForCausalLM

Bases: WhisperPreTrainedModel

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
class WhisperForCausalLM(WhisperPreTrainedModel):
    _tied_weights_keys = ["proj_out.weight"]
    main_input_name = "input_ids"

    def __init__(self, config):
        super().__init__(config)
        config.is_encoder_decoder = False
        self.model = WhisperDecoderWrapper(config)

        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[mindspore.Tensor]] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[mindspore.Tensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        Args:
            input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
            attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            encoder_outputs  (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                if the model is configured as a decoder.
            head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            cross_attn_head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
                tensors are only required when the model is used as a decoder in a Sequence to Sequence model. Contains
                pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
                blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. If
                `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
                don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
                `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            cache_position (`mindspore.Tensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache
                in the correct position and to infer the complete sequence length.

        Returns:

        Example:

        ```python
        >>> from transformers import WhisperForCausalLM, WhisperForConditionalGeneration, WhisperProcessor
        >>> import torch
        >>> from datasets import load_dataset

        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

        >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> sample = ds[0]["audio"]
        >>> input_features = processor(
        ...     sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="ms"
        ... ).input_features

        >>> predicted_ids = model.generate(input_features, assistant_model=assistant_model)

        >>> # decode token ids to text
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        >>> transcription
        ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # If the user passed a tuple or `BaseModelOutput` for encoder_outputs, we extract only the hidden states
        if isinstance(encoder_outputs, (BaseModelOutput, tuple, list)):
            encoder_outputs = encoder_outputs[0]

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_outputs,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        logits = self.proj_out(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        use_cache=None,
        encoder_outputs=None,
        attention_mask=None,
        cache_position=None,
        **kwargs,
    ):
        past_length = 0
        if past_key_values is not None:
            if isinstance(past_key_values, (Cache, EncoderDecoderCache)):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
            else:
                past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        if cache_position is None:
            cache_position = ops.arange(past_length, past_length + input_ids.shape[1])
        elif use_cache:
            cache_position = cache_position[-input_ids.shape[1] :]

        return {
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "input_ids": input_ids,
            "use_cache": use_cache,
            "attention_mask": attention_mask,
            "cache_position": cache_position,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),
            )
        return reordered_past
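
During generation, `prepare_inputs_for_generation` feeds the decoder only the tokens that the cache does not already cover. A minimal, self-contained sketch of that trimming rule (plain Python, no model required):

```python
# Mirrors the pruning rule in prepare_inputs_for_generation: drop the prefix the
# cache already covers, or fall back to the final token only.
def remove_prefix_length(input_len: int, past_length: int) -> int:
    if input_len > past_length:
        return past_length          # keep only the tokens beyond the cached length
    return input_len - 1            # old behaviour: keep just the last token

print(remove_prefix_length(input_len=7, past_length=6))  # 6 -> one new token is fed
print(remove_prefix_length(input_len=4, past_length=9))  # 3 -> only the final token survives
```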

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForCausalLM.forward(input_ids=None, attention_mask=None, encoder_outputs=None, head_mask=None, cross_attn_head_mask=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, cache_position=None)

PARAMETER DESCRIPTION
input_ids

Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. Indices can be obtained using [AutoTokenizer]. See [PreTrainedTokenizer.encode] and [PreTrainedTokenizer.__call__] for details. What are input IDs?

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)` DEFAULT: None

attention_mask

Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]:

  • 1 for tokens that are not masked,
  • 0 for tokens that are masked.

What are attention masks?

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

encoder_outputs

Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional* DEFAULT: None

head_mask

Mask to nullify selected heads of the attention modules. Mask values selected in [0, 1]:

  • 1 indicates the head is not masked,
  • 0 indicates the head is masked.

TYPE: `mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional* DEFAULT: None

cross_attn_head_mask

Mask to nullify selected heads of the cross-attention modules. Mask values selected in [0, 1]:

  • 1 indicates the head is not masked,
  • 0 indicates the head is masked.

TYPE: `mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional* DEFAULT: None

past_key_values

Tuple of tuple(mindspore.Tensor) of length config.n_layers, with each tuple having 2 tensors of shape (batch_size, num_heads, sequence_length, embed_size_per_head)) and 2 additional tensors of shape (batch_size, num_heads, encoder_sequence_length, embed_size_per_head). The two additional tensors are only required when the model is used as a decoder in a Sequence to Sequence model. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see past_key_values input) to speed up sequential decoding. If past_key_values are used, the user can optionally input only the last decoder_input_ids (those that don't have their past key value states given to this model) of shape (batch_size, 1) instead of all decoder_input_ids of shape (batch_size, sequence_length).

TYPE: `tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True` DEFAULT: None

inputs_embeds

Optionally, instead of passing input_ids you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert input_ids indices into associated vectors than the model's internal embedding lookup matrix.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional* DEFAULT: None

labels

Labels for computing the masked language modeling loss. Indices should either be in [0, ..., config.vocab_size] or -100 (see input_ids docstring). Tokens with indices set to -100 are ignored (masked); the loss is only computed for the tokens with labels in [0, ..., config.vocab_size].

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

use_cache

If set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).

TYPE: `bool`, *optional* DEFAULT: None

output_attentions

Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: None

output_hidden_states

Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: None

return_dict

Whether or not to return a [~utils.ModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional* DEFAULT: None

cache_position

Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache in the correct position and to infer the complete sequence length.

TYPE: `mindspore.Tensor` of shape `(sequence_length)`, *optional* DEFAULT: None

Example:

>>> from transformers import WhisperForCausalLM, WhisperForConditionalGeneration, WhisperProcessor
>>> import torch
>>> from datasets import load_dataset

>>> processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

>>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> sample = ds[0]["audio"]
>>> input_features = processor(
...     sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="ms"
... ).input_features

>>> predicted_ids = model.generate(input_features, assistant_model=assistant_model)

>>> # decode token ids to text
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
>>> transcription
' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'
Source code in mindnlp\transformers\models\whisper\modeling_whisper.py
def forward(
    self,
    input_ids: mindspore.Tensor = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    encoder_outputs: Optional[Tuple[mindspore.Tensor]] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    cross_attn_head_mask: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[mindspore.Tensor] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
    r"""
    Args:
        input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
            provide it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        encoder_outputs  (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`mindspore.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
            shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
            tensors are only required when the model is used as a decoder in a Sequence to Sequence model. Contains
            pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. If
            `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
            for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`mindspore.Tensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache
            in the correct position and to infer the complete sequence length.

    Returns:

    Example:

    ```python
    >>> from transformers import WhisperForCausalLM, WhisperForConditionalGeneration, WhisperProcessor
    >>> import torch
    >>> from datasets import load_dataset

    >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

    >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2")

    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> sample = ds[0]["audio"]
    >>> input_features = processor(
    ...     sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="ms"
    ... ).input_features

    >>> predicted_ids = model.generate(input_features, assistant_model=assistant_model)

    >>> # decode token ids to text
    >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    >>> transcription
    ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'
    ```"""
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # If the user passed a tuple or `BaseModelOutput` for encoder_outputs, we extract only the hidden states
    if isinstance(encoder_outputs, (BaseModelOutput, tuple, list)):
        encoder_outputs = encoder_outputs[0]

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model.decoder(
        input_ids=input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=encoder_outputs,
        head_mask=head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )

    logits = self.proj_out(outputs[0])

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithCrossAttentions(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        cross_attentions=outputs.cross_attentions,
    )

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration

Bases: WhisperGenerationMixin, WhisperPreTrainedModel

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 1265-1470
class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedModel):
    base_model_prefix = "model"
    _tied_weights_keys = ["proj_out.weight"]

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        self.model = WhisperModel(config)
        self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    def freeze_encoder(self):
        """
        Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will
        not be updated during training.
        """
        self.model.encoder._freeze_parameters()

    def forward(
        self,
        input_features: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        decoder_head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[mindspore.Tensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[mindspore.Tensor]] = None,
        decoder_position_ids: Optional[Tuple[mindspore.Tensor]] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[mindspore.Tensor] = None,
    ) -> Union[Tuple[mindspore.Tensor], Seq2SeqLMOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="ms")
        >>> input_features = inputs.input_features

        >>> generated_ids = model.generate(inputs=input_features)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_features,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        lm_logits = self.proj_out(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        use_cache=None,
        encoder_outputs=None,
        attention_mask=None,
        decoder_attention_mask=None,
        cache_position=None,
        **kwargs,
    ):
        decoder_position_ids = None
        if decoder_attention_mask is not None:
            decoder_position_ids = (decoder_attention_mask.cumsum(-1) - 1).clamp(min=0)

        past_length = 0
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
            else:
                past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if decoder_input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = decoder_input_ids.shape[1] - 1

            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]

            if decoder_position_ids is not None:
                decoder_position_ids = decoder_position_ids[:, remove_prefix_length:]

        if cache_position is None:
            cache_position = ops.arange(
                past_length, past_length + decoder_input_ids.shape[1]
            )
        elif use_cache:
            cache_position = cache_position[-decoder_input_ids.shape[1] :]

        if (
            isinstance(past_key_values, EncoderDecoderCache)
            and (
                isinstance(past_key_values.self_attention_cache, StaticCache)
                or isinstance(past_key_values.cross_attention_cache, StaticCache)
            )
            and decoder_attention_mask is not None
            and decoder_attention_mask.ndim == 2
        ):
            batch_size, sequence_length = decoder_input_ids.shape

            dtype = self.proj_out.weight.dtype
            min_dtype = float(ops.finfo(dtype).min)

            decoder_attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
                decoder_attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.self_attention_cache.get_max_length(),
                dtype=dtype,
                min_dtype=min_dtype,
                cache_position=cache_position,
                batch_size=batch_size,
            )

        return {
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "use_cache": use_cache,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_position_ids": decoder_position_ids,
            "cache_position": cache_position,
        }

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration.forward(input_features=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, head_mask=None, decoder_head_mask=None, cross_attn_head_mask=None, encoder_outputs=None, past_key_values=None, decoder_inputs_embeds=None, decoder_position_ids=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, cache_position=None)

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the language modeling loss. Indices should either be in [0, ..., config.vocab_size] or -100 (see input_ids docstring). Tokens with indices set to -100 are ignored (masked), the loss is only computed for the tokens with labels in [0, ..., config.vocab_size].

Returns:

Example:

>>> import torch
>>> from transformers import AutoProcessor, WhisperForConditionalGeneration
>>> from datasets import load_dataset

>>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="ms")
>>> input_features = inputs.input_features

>>> generated_ids = model.generate(inputs=input_features)

>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> transcription
' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
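When `labels` are supplied as described above, `forward` shifts them internally with `shift_tokens_right` to build the `decoder_input_ids` and returns the cross-entropy loss in its output. A minimal training-style sketch (the checkpoint, dataset fields, and label construction are illustrative assumptions):

```python
>>> from transformers import AutoProcessor, WhisperForConditionalGeneration
>>> from datasets import load_dataset

>>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> input_features = processor(ds[0]["audio"]["array"], return_tensors="ms").input_features
>>> labels = processor.tokenizer(ds[0]["text"], return_tensors="ms").input_ids

>>> # passing `labels` makes forward return a Seq2SeqLMOutput carrying the loss
>>> outputs = model(input_features, labels=labels)
>>> loss = outputs.loss
```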
Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 1299-1395
def forward(
    self,
    input_features: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    decoder_input_ids: Optional[mindspore.Tensor] = None,
    decoder_attention_mask: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    decoder_head_mask: Optional[mindspore.Tensor] = None,
    cross_attn_head_mask: Optional[mindspore.Tensor] = None,
    encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    past_key_values: Optional[Union[EncoderDecoderCache, Tuple[mindspore.Tensor]]] = None,
    decoder_inputs_embeds: Optional[Tuple[mindspore.Tensor]] = None,
    decoder_position_ids: Optional[Tuple[mindspore.Tensor]] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[mindspore.Tensor] = None,
) -> Union[Tuple[mindspore.Tensor], Seq2SeqLMOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
        or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
        only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Returns:

    Example:

    ```python
    >>> import torch
    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
    >>> from datasets import load_dataset

    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="ms")
    >>> input_features = inputs.input_features

    >>> generated_ids = model.generate(inputs=input_features)

    >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    >>> transcription
    ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
    ```"""
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if labels is not None:
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

    outputs = self.model(
        input_features,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        encoder_outputs=encoder_outputs,
        decoder_attention_mask=decoder_attention_mask,
        head_mask=head_mask,
        decoder_head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        past_key_values=past_key_values,
        decoder_inputs_embeds=decoder_inputs_embeds,
        decoder_position_ids=decoder_position_ids,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )
    lm_logits = self.proj_out(outputs[0])

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1))

    if not return_dict:
        output = (lm_logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return Seq2SeqLMOutput(
        loss=loss,
        logits=lm_logits,
        past_key_values=outputs.past_key_values,
        decoder_hidden_states=outputs.decoder_hidden_states,
        decoder_attentions=outputs.decoder_attentions,
        cross_attentions=outputs.cross_attentions,
        encoder_last_hidden_state=outputs.encoder_last_hidden_state,
        encoder_hidden_states=outputs.encoder_hidden_states,
        encoder_attentions=outputs.encoder_attentions,
    )

mindnlp.transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration.freeze_encoder()

Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will not be updated during training.

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 1292-1297
def freeze_encoder(self):
    """
    Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will
    not be updated during training.
    """
    self.model.encoder._freeze_parameters()
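A common fine-tuning recipe calls `freeze_encoder` right after loading the model, so that only the decoder and output projection are updated. A minimal sketch (the checkpoint and the torch-style `parameters()` iteration are assumptions):

```python
>>> from transformers import WhisperForConditionalGeneration

>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
>>> model.freeze_encoder()  # encoder parameters stop receiving gradient updates

>>> # hand only the remaining trainable parameters to the optimizer
>>> trainable = [p for p in model.parameters() if p.requires_grad]
```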

mindnlp.transformers.models.whisper.modeling_whisper.WhisperModel

Bases: WhisperPreTrainedModel

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 1099-1262
class WhisperModel(WhisperPreTrainedModel):
    def __init__(self, config: WhisperConfig):
        super().__init__(config)

        self.encoder = WhisperEncoder(config)
        self.decoder = WhisperDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def freeze_encoder(self):
        """
        Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will
        not be updated during training.
        """
        self.encoder._freeze_parameters()

    def _mask_input_features(
        self,
        input_features: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return input_features

        # generate indices & apply SpecAugment along time axis
        batch_size, hidden_size, sequence_length = input_features.shape

        if self.config.mask_time_prob > 0 and self.training:
            # generate indices & apply SpecAugment along time axis
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_)
            mask_time_indices = mask_time_indices[:, None].broadcast_to((-1, hidden_size, -1))
            input_features[mask_time_indices] = 0

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_)
            input_features[mask_feature_indices] = 0

        return input_features

    def forward(
        self,
        input_features: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        decoder_head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[mindspore.Tensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[mindspore.Tensor]] = None,
        decoder_position_ids: Optional[Tuple[mindspore.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[mindspore.Tensor] = None,
    ) -> Union[Tuple[mindspore.Tensor], Seq2SeqModelOutput]:
        r"""
        Returns:

        Example:
         ```python
         >>> import mindspore
         >>> from transformers import AutoFeatureExtractor, WhisperModel
         >>> from datasets import load_dataset

         >>> model = WhisperModel.from_pretrained("openai/whisper-base")
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="ms")
         >>> input_features = inputs.input_features
         >>> decoder_input_ids = mindspore.tensor([[1, 1]]) * model.config.decoder_start_token_id
         >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
         >>> list(last_hidden_state.shape)
         [1, 2, 512]
         ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            input_features = self._mask_input_features(input_features, attention_mask=attention_mask)

            encoder_outputs = self.encoder(
                input_features,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
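`_mask_input_features` above applies SpecAugment-style masking only while the model is in training mode and only when the corresponding probabilities are non-zero. A minimal configuration sketch (the attribute names are the ones read by that method; the checkpoint and values are illustrative):

```python
>>> from transformers import WhisperModel

>>> model = WhisperModel.from_pretrained("openai/whisper-base")
>>> model.config.apply_spec_augment = True   # enable masking (checked first)
>>> model.config.mask_time_prob = 0.05       # probability of masking along the time axis
>>> model.config.mask_time_length = 10
>>> model.config.mask_feature_prob = 0.0     # leave the feature axis unmasked
>>> # masking is applied inside forward only while the model is training
```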

mindnlp.transformers.models.whisper.modeling_whisper.WhisperModel.forward(input_features=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, head_mask=None, decoder_head_mask=None, cross_attn_head_mask=None, encoder_outputs=None, past_key_values=None, decoder_inputs_embeds=None, decoder_position_ids=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, cache_position=None)

Example
>>> import mindspore
>>> from transformers import AutoFeatureExtractor, WhisperModel
>>> from datasets import load_dataset

>>> model = WhisperModel.from_pretrained("openai/whisper-base")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="ms")
>>> input_features = inputs.input_features
>>> decoder_input_ids = mindspore.tensor([[1, 1]]) * model.config.decoder_start_token_id
>>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 2, 512]
Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 1170-1262
def forward(
    self,
    input_features: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    decoder_input_ids: Optional[mindspore.Tensor] = None,
    decoder_attention_mask: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    decoder_head_mask: Optional[mindspore.Tensor] = None,
    cross_attn_head_mask: Optional[mindspore.Tensor] = None,
    encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    past_key_values: Optional[Union[EncoderDecoderCache, Tuple[mindspore.Tensor]]] = None,
    decoder_inputs_embeds: Optional[Tuple[mindspore.Tensor]] = None,
    decoder_position_ids: Optional[Tuple[mindspore.Tensor]] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[mindspore.Tensor] = None,
) -> Union[Tuple[mindspore.Tensor], Seq2SeqModelOutput]:
    r"""
    Returns:

    Example:
     ```python
     >>> import mindspore
     >>> from transformers import AutoFeatureExtractor, WhisperModel
     >>> from datasets import load_dataset

     >>> model = WhisperModel.from_pretrained("openai/whisper-base")
     >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
     >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
     >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="ms")
     >>> input_features = inputs.input_features
     >>> decoder_input_ids = mindspore.tensor([[1, 1]]) * model.config.decoder_start_token_id
     >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
     >>> list(last_hidden_state.shape)
     [1, 2, 512]
     ```"""
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if encoder_outputs is None:
        input_features = self._mask_input_features(input_features, attention_mask=attention_mask)

        encoder_outputs = self.encoder(
            input_features,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
    elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
        encoder_outputs = BaseModelOutput(
            last_hidden_state=encoder_outputs[0],
            hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
            attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
        )

    # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
    decoder_outputs = self.decoder(
        input_ids=decoder_input_ids,
        attention_mask=decoder_attention_mask,
        encoder_hidden_states=encoder_outputs[0],
        head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        past_key_values=past_key_values,
        inputs_embeds=decoder_inputs_embeds,
        position_ids=decoder_position_ids,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )

    if not return_dict:
        return decoder_outputs + encoder_outputs

    return Seq2SeqModelOutput(
        last_hidden_state=decoder_outputs.last_hidden_state,
        past_key_values=decoder_outputs.past_key_values,
        decoder_hidden_states=decoder_outputs.hidden_states,
        decoder_attentions=decoder_outputs.attentions,
        cross_attentions=decoder_outputs.cross_attentions,
        encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        encoder_hidden_states=encoder_outputs.hidden_states,
        encoder_attentions=encoder_outputs.attentions,
    )

mindnlp.transformers.models.whisper.modeling_whisper.WhisperModel.freeze_encoder()

Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will not be updated during training.

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 1120-1125
def freeze_encoder(self):
    """
    Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will
    not be updated during training.
    """
    self.encoder._freeze_parameters()

mindnlp.transformers.models.whisper.modeling_whisper.WhisperPreTrainedModel

Bases: PreTrainedModel

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 581-613
class WhisperPreTrainedModel(PreTrainedModel):
    config_class = WhisperConfig
    base_model_prefix = "model"
    main_input_name = "input_features"
    supports_gradient_checkpointing = True
    _no_split_modules = ["WhisperEncoderLayer", "WhisperDecoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight[module.padding_idx] = 0
        elif isinstance(module, WhisperEncoder):
            with no_grad():
                embed_positions = module.embed_positions.weight
                embed_positions.assign_value(sinusoids(*embed_positions.shape))

    def _get_feat_extract_output_lengths(self, input_lengths: mindspore.Tensor):
        """
        Computes the output length of the convolutional layers
        """
        input_lengths = (input_lengths - 1) // 2 + 1

        return input_lengths
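For example, the 3000 log-mel frames of a 30-second input map to 1500 encoder positions under this formula:

```python
>>> # (input_lengths - 1) // 2 + 1: the stride-2 convolution halves the time dimension
>>> (3000 - 1) // 2 + 1
1500
```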

mindnlp.transformers.models.whisper.modeling_whisper.shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id)

Shift input ids one token to the right.

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 114-127
def shift_tokens_right(input_ids: mindspore.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
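For illustration, with arbitrary token ids and `-100` marking positions ignored by the loss:

```python
>>> import mindspore
>>> from mindnlp.transformers.models.whisper.modeling_whisper import shift_tokens_right

>>> labels = mindspore.tensor([[786, 1002, -100, -100]])
>>> shifted = shift_tokens_right(labels, pad_token_id=50257, decoder_start_token_id=50258)
>>> # shifted == [[50258, 786, 1002, 50257]]: the start token is prepended, the last label is
>>> # dropped, and the remaining -100 is replaced by the pad token id
```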

mindnlp.transformers.models.whisper.modeling_whisper.sinusoids(length, channels, max_timescale=10000)

Returns sinusoids for positional embedding

Source code in mindnlp\transformers\models\whisper\modeling_whisper.py, lines 101-110
def sinusoids(length: int, channels: int, max_timescale: float = 10000) -> mindspore.Tensor:
    """Returns sinusoids for positional embedding"""
    if channels % 2 != 0:
        raise ValueError(
            f"Number of channels has to be divisible by 2 for sinusoidal positional embeddings, got {channels} channels."
        )
    log_timescale_increment = math.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = ops.exp(-log_timescale_increment * ops.arange(channels // 2))
    scaled_time = ops.arange(length).view(-1, 1) * inv_timescales.view(1, -1)
    return ops.cat([scaled_time.sin(), scaled_time.cos()], dim=1)
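The sine and cosine halves are concatenated along the channel dimension, so the result has shape `(length, channels)`:

```python
>>> from mindnlp.transformers.models.whisper.modeling_whisper import sinusoids

>>> pe = sinusoids(1500, 384)  # e.g. a positional table for d_model=384 and 1500 encoder positions
>>> pe.shape
(1500, 384)
```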

mindnlp.transformers.models.whisper.tokenization_whisper

Tokenization classes for Whisper.

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer

Bases: PreTrainedTokenizer

Construct a Whisper tokenizer.

This tokenizer inherits from [PreTrainedTokenizer] which contains some of the main methods. Users should refer to the superclass for more information regarding such methods.

PARAMETER DESCRIPTION
vocab_file

Path to the vocabulary file.

TYPE: `str`

merges_file

Path to the merges file.

TYPE: `str`

normalizer_file

Path to the normalizer file.

TYPE: `str`, *optional* DEFAULT: None

errors

Paradigm to follow when decoding bytes to UTF-8. See bytes.decode for more information.

TYPE: `str`, *optional*, defaults to `"replace"` DEFAULT: 'replace'

unk_token

The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead.

TYPE: `str`, *optional*, defaults to `"<|endoftext|>"` DEFAULT: '<|endoftext|>'

bos_token

The beginning of sequence token. The decoder_start_token_id is used to set the first token as "<|startoftranscript|>" when generating.

TYPE: `str`, *optional*, defaults to `"<|endoftext|>"` DEFAULT: '<|endoftext|>'

eos_token

The end of sequence token.

TYPE: `str`, *optional*, defaults to `"<|endoftext|>"` DEFAULT: '<|endoftext|>'

pad_token

The token used for padding, for example when batching sequences of different lengths.

TYPE: `str`, *optional* DEFAULT: None

add_prefix_space

Whether or not to add an initial space to the input. This allows the leading word to be treated just like any other word.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

language

The language of the transcription text. The corresponding language id token is appended to the start of the sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token "<|es|>" is appended to the start of sequence. This should be used for multilingual fine-tuning only.

TYPE: `str`, *optional* DEFAULT: None

task

Task identifier to append at the start of sequence (if any). This should be used for multilingual fine-tuning, with "transcribe" for speech recognition and "translate" for speech translation.

TYPE: `str`, *optional* DEFAULT: None

predict_timestamps

Whether to omit the <|notimestamps|> token at the start of the sequence.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False
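A minimal usage sketch combining the `language`, `task`, and `predict_timestamps` arguments described above (checkpoint name as used in the examples elsewhere in these docs):

```python
>>> from transformers import WhisperTokenizer

>>> tokenizer = WhisperTokenizer.from_pretrained(
...     "openai/whisper-tiny", language="spanish", task="transcribe", predict_timestamps=False
... )
>>> # <|startoftranscript|><|es|><|transcribe|><|notimestamps|> are prepended to every
>>> # encoded label sequence, and <|endoftext|> is appended
>>> ids = tokenizer("Hola, ¿cómo estás?").input_ids
```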

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py, lines 210-875
class WhisperTokenizer(PreTrainedTokenizer):
    """
    Construct a Whisper tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
    the superclass for more information regarding such methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalizer_file (`str`, *optional*):
            Path to the normalizer file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
            `"<|startoftranscript|>"` when generating.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*):
            The token used for padding, for example when batching sequences of different lengths.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows the leading word to be treated just
            like any other word.
        language (`str`, *optional*):
            The language of the transcription text. The corresponding language id token is appended to the start of the
            sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
            `"<|es|>"` is appended to the start of sequence. This should be used for multilingual fine-tuning only.
        task (`str`, *optional*):
            Task identifier to append at the start of sequence (if any). This should be used for multilingual
            fine-tuning, with `"transcribe"` for speech recognition and `"translate"` for speech translation.
        predict_timestamps (`bool`, *optional*, defaults to `False`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        normalizer_file=None,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token=None,
        add_prefix_space=False,
        language=None,
        task=None,
        predict_timestamps=False,
        **kwargs,
    ):
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(pad_token, str)
            else pad_token
        )

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        if normalizer_file is not None:
            with open(normalizer_file, encoding="utf-8") as vocab_handle:
                self.english_spelling_normalizer = json.load(vocab_handle)
        else:
            self.english_spelling_normalizer = None

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")

        self.language = language
        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        self.task = task
        self.predict_timestamps = predict_timestamps

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe with GPT2 -> Whisper
    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
        """
        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
        update the prefix tokens as required when fine-tuning. Example:

        ```python
        >>> # instantiate the tokenizer and set the prefix token to Spanish
        >>> tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
        >>> # now switch the prefix token from Spanish to French
        >>> tokenizer.set_prefix_tokens(language="french")
        ```

        Args:
            language (`str`, *optional*, defaults to `None`):
                The language of the transcription text.
            task (`str`, *optional*, defaults to `None`):
                Task identifier to append at the start of sequence (if any).
            predict_timestamps (`bool`, *optional*, defaults to `None`):
                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
        """
        self.language = language if language is not None else self.language
        self.task = task if task is not None else self.task
        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps

    @property
    def prefix_tokens(self) -> List[int]:
        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
        langs = tuple(LANGUAGES.keys())

        if self.language is not None:
            self.language = self.language.lower()
            if self.language in TO_LANGUAGE_CODE:
                language_id = TO_LANGUAGE_CODE[self.language]
            elif self.language in TO_LANGUAGE_CODE.values():
                language_id = self.language
            else:
                is_language_code = len(self.language) == 2
                raise ValueError(
                    f"Unsupported language: {self.language}. Language should be one of:"
                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
                )

        if self.task is not None:
            if self.task not in TASK_IDS:
                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")

        bos_sequence = [bos_token_id]
        if self.language is not None:
            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
        if self.task is not None:
            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
        if not self.predict_timestamps:
            bos_sequence.append(notimestamps_token_id)
        return bos_sequence

    # Copied from transformers.models.speech_to_text.tokenization_speech_to_text.Speech2TextTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]

    # Copied from transformers.models.speech_to_text.tokenization_speech_to_text.Speech2TextTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1]
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize with GPT2 -> Whisper
    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id with GPT2 -> Whisper
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) into a token (str) using the vocab. Whisper's base tokenizer always decodes OOV
        tokens as "", thus we do not use the `unk_token` here.
        """
        return self.decoder.get(index, "")

    def _normalize(self, text):
        warnings.warn(
            "The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
            "You can normalize an input string using the Whisper English normalizer using the `normalize` method."
        )
        return self.normalize(text)

    def _basic_normalize(self, text, remove_diacritics=False):
        warnings.warn(
            "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
            "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
        )
        return self.basic_normalize(text, remove_diacritics=remove_diacritics)

    def normalize(self, text):
        """
        Normalize a given string using the `EnglishTextNormalizer` class, which performs common transformations on
        English text.
        """
        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
        return normalizer(text)

    @staticmethod
    def basic_normalize(text, remove_diacritics=False):
        """
        Normalize a given string using the `BasicTextNormalizer` class, which performs common transformations on
        multilingual text.
        """
        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
        return normalizer(text)

    def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_precision=0.02) -> str:
        """
        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
        given tokens with timestamp tokens annotated, e.g. "<|1.08|>".
        """
        timestamp_begin = self.all_special_ids[-1] + 1
        outputs = [[]]

        cur_max_timestamp = 0.0
        prev_segments_len = 0.0

        for token in token_ids:
            if token >= timestamp_begin:
                timestamp = float((token - timestamp_begin) * time_precision)

                if timestamp < cur_max_timestamp:
                    # next segment has started
                    prev_segments_len += cur_max_timestamp

                cur_max_timestamp = timestamp

                outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
                outputs.append([])
            else:
                outputs[-1].append(token)
        outputs = [
            s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
        ]
        return "".join(outputs)

    def _compute_offsets(self, token_ids, time_precision=0.02):
        """
        Compute offsets for a given tokenized input

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        offsets = []
        # ensure torch tensor of token ids is placed on cpu
        if "torch" in str(type(token_ids)) and (hasattr(token_ids, "cpu") and callable(token_ids.cpu)):
            token_ids = token_ids.cpu()
        token_ids = np.array(token_ids)
        if token_ids.shape[0] > 1 and len(token_ids.shape) > 1:
            raise ValueError("Can only process a single input at a time")
        timestamp_begin = self.all_special_ids[-1] + 1
        timestamp_tokens = token_ids >= timestamp_begin

        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
        if consecutive.shape[0] == 0 and timestamp_tokens.sum() <= 1:
            # either there are no timestamps or there are no consecutive ones
            return []
        elif np.where(timestamp_tokens)[0][-1] + 1 not in consecutive:
            # we add the final timestamp if it is not already in the list
            consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1)

        last_slice = np.where(timestamp_tokens)[0][0]
        for current_slice in consecutive:
            sliced_tokens = token_ids[last_slice:current_slice]
            if len(sliced_tokens) > 1:
                start_timestamp_position = sliced_tokens[0].item() - timestamp_begin
                end_timestamp_position = sliced_tokens[-1].item() - timestamp_begin
                # strip timestamp tokens from the text output
                sliced_tokens = self._preprocess_token_ids(sliced_tokens)
                text = self._decode(sliced_tokens)
                text = self._filter_timestamp_ids(text)
                offsets.append(
                    {
                        "text": text,
                        "timestamp": (
                            start_timestamp_position * time_precision,
                            end_timestamp_position * time_precision,
                        ),
                    }
                )
            last_slice = current_slice

        return offsets

    @lru_cache
    def timestamp_ids(self, time_precision=0.02):
        """
        Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

        Args:
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])

    def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
        """
        Pre-process the token ids for decoding by removing the prompt token ids and timestamp token ids.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Typically, obtained using the `__call__` method of the tokenizer.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens from the token ids. If `True`, the prompt token ids will be
                removed.
        """
        if skip_special_tokens:
            prompt_token_id = self.convert_tokens_to_ids("<|startofprev|>")
            decoder_start_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
            token_ids = self._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)

        return token_ids

    def _filter_timestamp_ids(self, token_ids):
        return re.sub(self.timestamp_pat, "", token_ids)

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        output_offsets: bool = False,
        time_precision: float = 0.02,
        decode_with_timestamps: bool = False,
        normalize: bool = False,
        basic_normalize: bool = False,
        remove_diacritics: bool = False,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            output_offsets (`bool`, *optional*, defaults to `False`):
                Whether or not to output the offsets of the tokens. This should only be set if the model predicted
                timestamps.
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
            decode_with_timestamps (`bool`, *optional*, defaults to `False`):
                Whether or not to decode with timestamps included in the raw text.
            normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to apply the English text normalizer to the decoded text. Only applicable when the
                target text is in English. Otherwise, the basic text normalizer should be applied.
            basic_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual
                target text.
            remove_diacritics (`bool`, *optional*, defaults to `False`):
                Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may
                destroy information in the decoded text, hence it should be used with caution.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.
        Returns:
            `str`: The decoded sentence.
        """
        filtered_ids = self._preprocess_token_ids(
            token_ids,
            skip_special_tokens=skip_special_tokens,
        )

        text = super().decode(
            filtered_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            normalize=normalize,
            basic_normalize=basic_normalize,
            remove_diacritics=remove_diacritics,
            **kwargs,
        )
        if decode_with_timestamps:
            # legacy method to decode timestamps when not included in the tokenizer vocabulary
            text = self._decode_with_timestamps(
                filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
            )
        else:
            text = self._filter_timestamp_ids(text)

        # retrieve offsets
        if output_offsets:
            offsets = self._compute_offsets(token_ids, time_precision=time_precision)
            return {"text": text, "offsets": offsets}
        return text

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        normalize: bool = False,
        basic_normalize: bool = False,
        remove_diacritics: bool = False,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # To avoid mixing byte-level and unicode for byte-level BPE
        # we need to build string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))

        text = "".join(sub_texts)

        if normalize:
            clean_text = self.normalize(text)
            return clean_text
        elif basic_normalize:
            clean_text = self.basic_normalize(text, remove_diacritics=remove_diacritics)
            return clean_text
        else:
            return text

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string with GPT2 -> Whisper
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )
        normalizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        if self.english_spelling_normalizer is not None:
            with open(normalizer_file, "w", encoding="utf-8") as f:
                f.write(
                    json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                )

        return vocab_file, merge_file, normalizer_file

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.prepare_for_tokenization with GPT2 -> Whisper
    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
        # prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
        # we don't want to force the bos token at position 1, as this is the starting token
        # when we generate, so we slice the prefix tokens to: <|lang_id|> <|task|> <|notimestamps|>
        # to get the forced tokens
        forced_tokens = self.prefix_tokens[1:]
        forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_tokens)]
        return forced_decoder_ids

    def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
        return _decode_asr(
            self,
            model_outputs,
            return_timestamps=return_timestamps,
            return_language=return_language,
            time_precision=time_precision,
        )

    def get_prompt_ids(self, text: str, return_tensors="np"):
        """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
        batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)

        # Check for special tokens
        prompt_text_ids = batch_encoding["input_ids"][1:]
        special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
        if special_token_id is not None:
            token = self.convert_ids_to_tokens(special_token_id)
            raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")

        batch_encoding.convert_to_tensors(tensor_type=return_tensors)
        return batch_encoding["input_ids"]

    def _strip_prompt(self, token_ids: List[int], prompt_token_id: int, decoder_start_token_id: int):
        if not isinstance(token_ids, list):
            token_ids = self._convert_to_list(token_ids)

        # handle case of empty token_ids for decoding with timestamps.
        # at this point token_ids is a list, so it is safe to use if not check.
        if not token_ids:
            return token_ids

        has_prompt = token_ids[0] == prompt_token_id
        if has_prompt:
            if decoder_start_token_id in token_ids:
                return token_ids[token_ids.index(decoder_start_token_id) :]
            else:
                return []

        return token_ids

    @staticmethod
    def _convert_to_list(token_ids):
        # convert type to ndarray if necessary
        if hasattr(token_ids, "numpy"):
            if "torch" in str(type(token_ids)):
                token_ids = token_ids.cpu().numpy()
            elif "tensorflow" in str(type(token_ids)):
                token_ids = token_ids.numpy()
        # now the token ids are either a numpy array, or a list of lists
        if isinstance(token_ids, np.ndarray):
            token_ids = token_ids.tolist()
        return token_ids

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize(text, remove_diacritics=False) staticmethod

Normalize a given string using the BasicTextNormalizer class, which performs common transformations on multilingual text.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
@staticmethod
def basic_normalize(text, remove_diacritics=False):
    """
    Normalize a given string using the `BasicTextNormalizer` class, which performs common transformations on
    multilingual text.
    """
    normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
    return normalizer(text)
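
A minimal usage sketch (assuming `WhisperTokenizer` is exported from `mindnlp.transformers`, mirroring the Hugging Face API). Since `basic_normalize` is a static method, no tokenizer instance or vocabulary files are needed:

```python
from mindnlp.transformers import WhisperTokenizer

# lower-cases, strips punctuation and (optionally) diacritics, and collapses repeated whitespace
cleaned = WhisperTokenizer.basic_normalize("¿Qué tal?  Très bien!", remove_diacritics=True)
print(cleaned)
```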

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1=None)

Build model inputs from a sequence by appending eos_token_id.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
    """Build model inputs from a sequence by appending eos_token_id."""
    if token_ids_1 is None:
        return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
    # We don't expect to process pairs, but leave the pair logic for API consistency
    return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
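
A short sketch of how label sequences are framed; the checkpoint name and the `language`/`task` values are illustrative assumptions:

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="english", task="transcribe")

text_ids = tokenizer(" hello world", add_special_tokens=False)["input_ids"]
# prepends the prefix tokens (<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>)
# and appends the <|endoftext|> token
label_ids = tokenizer.build_inputs_with_special_tokens(text_ids)
```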

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.convert_tokens_to_string(tokens)

Converts a sequence of tokens (string) into a single string.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string."""
    text = "".join(tokens)
    text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
    return text
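
An illustrative round trip from ids to byte-level BPE tokens and back to text (the checkpoint name is an assumption):

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

ids = tokenizer(" hello world", add_special_tokens=False)["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(ids)       # byte-level symbols such as 'Ġhello'
text = tokenizer.convert_tokens_to_string(tokens)   # decodes the byte-level symbols back to UTF-8 text
```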

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.decode(token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, output_offsets=False, time_precision=0.02, decode_with_timestamps=False, normalize=False, basic_normalize=False, remove_diacritics=False, **kwargs)

Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces.

Similar to doing self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids)).

PARAMETER DESCRIPTION
token_ids

List of tokenized input ids. Can be obtained using the __call__ method.

TYPE: `Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`

skip_special_tokens

Whether or not to remove special tokens in the decoding.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

clean_up_tokenization_spaces

Whether or not to clean up the tokenization spaces. If None, will default to self.clean_up_tokenization_spaces (available in the tokenizer_config).

TYPE: `bool`, *optional* DEFAULT: None

output_offsets

Whether or not to output the offsets of the tokens. This should only be set if the model predicted timestamps.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

time_precision

The time ratio to convert from token to time.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

decode_with_timestamps

Whether or not to decode with timestamps included in the raw text.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

normalize

Whether or not to apply the English text normalizer to the decoded text. Only applicable when the target text is in English. Otherwise, the basic text normalizer should be applied.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

basic_normalize

Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual target text.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

remove_diacritics

Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may destroy information in the decoded text, hence it should be used with caution.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

kwargs

Will be passed to the underlying model specific decode method.

TYPE: additional keyword arguments, *optional* DEFAULT: {}

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def decode(
    self,
    token_ids,
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = None,
    output_offsets: bool = False,
    time_precision: float = 0.02,
    decode_with_timestamps: bool = False,
    normalize: bool = False,
    basic_normalize: bool = False,
    remove_diacritics: bool = False,
    **kwargs,
) -> str:
    """
    Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
    tokens and clean up tokenization spaces.

    Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

    Args:
        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
            List of tokenized input ids. Can be obtained using the `__call__` method.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (`bool`, *optional*):
            Whether or not to clean up the tokenization spaces. If `None`, will default to
            `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
        output_offsets (`bool`, *optional*, defaults to `False`):
            Whether or not to output the offsets of the tokens. This should only be set if the model predicted
            timestamps.
        time_precision (`float`, *optional*, defaults to 0.02):
            The time ratio to convert from token to time.
        decode_with_timestamps (`bool`, *optional*, defaults to `False`):
            Whether or not to decode with timestamps included in the raw text.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to apply the English text normalizer to the decoded text. Only applicable when the
            target text is in English. Otherwise, the basic text normalizer should be applied.
        basic_normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual
            target text.
        remove_diacritics (`bool`, *optional*, defaults to `False`):
            Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may
            destroy information in the decoded text, hence it should be used with caution.
        kwargs (additional keyword arguments, *optional*):
            Will be passed to the underlying model specific decode method.
    Returns:
        `str`: The decoded sentence.
    """
    filtered_ids = self._preprocess_token_ids(
        token_ids,
        skip_special_tokens=skip_special_tokens,
    )

    text = super().decode(
        filtered_ids,
        skip_special_tokens=skip_special_tokens,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        normalize=normalize,
        basic_normalize=basic_normalize,
        remove_diacritics=remove_diacritics,
        **kwargs,
    )
    if decode_with_timestamps:
        # legacy method to decode timestamps when not included in the tokenizer vocabulary
        text = self._decode_with_timestamps(
            filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
        )
    else:
        text = self._filter_timestamp_ids(text)

    # retrieve offsets
    if output_offsets:
        offsets = self._compute_offsets(token_ids, time_precision=time_precision)
        return {"text": text, "offsets": offsets}
    return text
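
A sketch of decoding with timestamps and offsets. The token ids are constructed by hand purely for illustration: timestamp tokens sit directly above the special-token id range, at `time_precision` = 0.02 s per step, and the expected outputs in the comments are approximate:

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

timestamp_begin = tokenizer.all_special_ids[-1] + 1                  # id of <|0.00|>
text_ids = tokenizer(" hello world", add_special_tokens=False)["input_ids"]
token_ids = [timestamp_begin] + text_ids + [timestamp_begin + 50]    # 50 * 0.02 s = 1.00 s

print(tokenizer.decode(token_ids, decode_with_timestamps=True))
# roughly: '<|0.00|> hello world<|1.00|>'
print(tokenizer.decode(token_ids, output_offsets=True))
# roughly: {'text': ' hello world', 'offsets': [{'text': ' hello world', 'timestamp': (0.0, 1.0)}]}
```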

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_prompt_ids(text, return_tensors='np')

Converts prompt text to IDs that can be passed to [~WhisperForConditionalGeneration.generate].

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def get_prompt_ids(self, text: str, return_tensors="np"):
    """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
    batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)

    # Check for special tokens
    prompt_text_ids = batch_encoding["input_ids"][1:]
    special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
    if special_token_id is not None:
        token = self.convert_ids_to_tokens(special_token_id)
        raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")

    batch_encoding.convert_to_tensors(tensor_type=return_tensors)
    return batch_encoding["input_ids"]
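
An illustrative sketch (the prompt text is arbitrary). The returned ids start with `<|startofprev|>` followed by the tokenized prompt, and can be passed as `prompt_ids` to `WhisperForConditionalGeneration.generate` to bias the transcription:

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

# prompt_ids[0] is the id of <|startofprev|>, the rest is the tokenized prompt text
prompt_ids = tokenizer.get_prompt_ids("Glossary: MindSpore, MindNLP", return_tensors="np")
```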

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_special_tokens_mask(token_ids_0, token_ids_1=None, already_has_special_tokens=False)

Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer prepare_for_model method.

PARAMETER DESCRIPTION
token_ids_0

List of IDs.

TYPE: `List[int]`

token_ids_1

Optional second list of IDs for sequence pairs.

TYPE: `List[int]`, *optional* DEFAULT: None

already_has_special_tokens

Whether or not the token list is already formatted with special tokens for the model.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

RETURNS DESCRIPTION
List[int]

List[int]: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` method.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """

    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    prefix_ones = [1] * len(self.prefix_tokens)
    suffix_ones = [1]
    if token_ids_1 is None:
        return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
    return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
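
A sketch of the mask layout (checkpoint and prefix settings are assumptions): the prefix tokens and the trailing `<|endoftext|>` are marked with 1, the label text with 0:

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="english", task="transcribe")

text_ids = tokenizer(" hello", add_special_tokens=False)["input_ids"]
mask = tokenizer.get_special_tokens_mask(text_ids)
# e.g. [1, 1, 1, 1, 0, ..., 0, 1]: four prefix tokens, the text tokens, then <|endoftext|>
```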

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize(text)

Normalize a given string using the EnglishTextNormalizer class, which performs common transformations on English text.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def normalize(self, text):
    """
    Normalize a given string using the `EnglishTextNormalizer` class, which performs common transformations on
    English text.
    """
    normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
    return normalizer(text)
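
A usage sketch; the exact output depends on the English spelling normalizer shipped with the checkpoint, so the comment only describes the kind of transformations applied:

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

raw = "Mr. Smith payed One Hundred and Fifty dollars!"
print(tokenizer.normalize(raw))
# lower-cases, strips punctuation, and standardises spellings and numbers,
# which is the usual preprocessing before computing WER on English transcripts
```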

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.set_prefix_tokens(language=None, task=None, predict_timestamps=None)

Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to update the prefix tokens as required when fine-tuning. Example:

>>> # instantiate the tokenizer and set the prefix token to Spanish
>>> tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
>>> # now switch the prefix token from Spanish to French
>>> tokenizer.set_prefix_tokens(language="french")
PARAMETER DESCRIPTION
language

The language of the transcription text.

TYPE: `str`, *optional*, defaults to `None` DEFAULT: None

task

Task identifier to append at the start of sequence (if any).

TYPE: `str`, *optional*, defaults to `None` DEFAULT: None

predict_timestamps

Whether to omit the <|notimestamps|> token at the start of the sequence.

TYPE: `bool`, *optional*, defaults to `None` DEFAULT: None

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
    """
    Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
    update the prefix tokens as required when fine-tuning. Example:

    ```python
    >>> # instantiate the tokenizer and set the prefix token to Spanish
    >>> tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
    >>> # now switch the prefix token from Spanish to French
    >>> tokenizer.set_prefix_tokens(language="french")
    ```

    Args:
        language (`str`, *optional*, defaults to `None`):
            The language of the transcription text.
        task (`str`, *optional*, defaults to `None`):
            Task identifier to append at the start of sequence (if any).
        predict_timestamps (`bool`, *optional*, defaults to `None`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
    """
    self.language = language if language is not None else self.language
    self.task = task if task is not None else self.task
    self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps

mindnlp.transformers.models.whisper.tokenization_whisper.WhisperTokenizer.timestamp_ids(time_precision=0.02) cached

Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

PARAMETER DESCRIPTION
time_precision

The time ratio to convert from token to time.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
@lru_cache
def timestamp_ids(self, time_precision=0.02):
    """
    Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

    Args:
        time_precision (`float`, *optional*, defaults to 0.02):
            The time ratio to convert from token to time.
    """
    return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
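
A brief sketch; with the default precision the ids cover `<|0.00|>` through `<|30.00|>` (1501 tokens), assuming the checkpoint's vocabulary contains the timestamp tokens:

```python
from mindnlp.transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

ids = tokenizer.timestamp_ids(time_precision=0.02)
print(len(ids))   # 1501, one id per timestamp token from <|0.00|> to <|30.00|>
```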

mindnlp.transformers.models.whisper.tokenization_whisper.bytes_to_unicode()

Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
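
A small sketch of the byte-to-unicode table. Whitespace bytes are remapped to printable characters (the space byte becomes 'Ġ'), which is why BPE tokens never contain raw spaces:

```python
from mindnlp.transformers.models.whisper.tokenization_whisper import bytes_to_unicode

byte_encoder = bytes_to_unicode()                       # {byte value 0-255: printable unicode char}
byte_decoder = {v: k for k, v in byte_encoder.items()}  # inverse map used when decoding text

print(len(byte_encoder))       # 256: every possible byte is covered
print(byte_encoder[ord(" ")])  # 'Ġ': the space byte is shifted into the printable range
```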

mindnlp.transformers.models.whisper.tokenization_whisper.get_pairs(word)

Return set of symbol pairs in a word.

Word is represented as tuple of symbols (symbols being variable-length strings).

Source code in mindnlp\transformers\models\whisper\tokenization_whisper.py
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
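
A minimal illustration: the returned set contains the adjacent symbol pairs that the BPE loop ranks against `bpe_ranks` to pick the next merge:

```python
from mindnlp.transformers.models.whisper.tokenization_whisper import get_pairs

word = ("h", "e", "l", "l", "o")
print(get_pairs(word))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}  (set order is arbitrary)
```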

mindnlp.transformers.models.whisper.processing_whisper

Speech processor class for Whisper

mindnlp.transformers.models.whisper.processing_whisper.WhisperProcessor

Bases: ProcessorMixin

Constructs a Whisper processor which wraps a Whisper feature extractor and a Whisper tokenizer into a single processor.

[WhisperProcessor] offers all the functionalities of [WhisperFeatureExtractor] and [WhisperTokenizer]. See the [~WhisperProcessor.__call__] and [~WhisperProcessor.decode] for more information.

PARAMETER DESCRIPTION
feature_extractor

An instance of [WhisperFeatureExtractor]. The feature extractor is a required input.

TYPE: `WhisperFeatureExtractor`

tokenizer

An instance of [WhisperTokenizer]. The tokenizer is a required input.

TYPE: `WhisperTokenizer`

Source code in mindnlp\transformers\models\whisper\processing_whisper.py
class WhisperProcessor(ProcessorMixin):
    r"""
    Constructs a Whisper processor which wraps a Whisper feature extractor and a Whisper tokenizer into a single
    processor.

    [`WhisperProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`WhisperTokenizer`]. See
    the [`~WhisperProcessor.__call__`] and [`~WhisperProcessor.decode`] for more information.

    Args:
        feature_extractor (`WhisperFeatureExtractor`):
            An instance of [`WhisperFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`WhisperTokenizer`):
            An instance of [`WhisperTokenizer`]. The tokenizer is a required input.
    """

    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "WhisperTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False

    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)

    def __call__(self, *args, **kwargs):
        """
        Forwards the `audio` argument to WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] and the `text`
        argument to [`~WhisperTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
        information.
        """
        # For backward compatibility
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        audio = kwargs.pop("audio", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
            args = args[1:]

        if audio is None and text is None:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            return inputs

        elif audio is None:
            return encodings
        else:
            inputs["labels"] = encodings["input_ids"]
            return inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def get_prompt_ids(self, text: str, return_tensors="np"):
        return self.tokenizer.get_prompt_ids(text, return_tensors=return_tensors)
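
A usage sketch; the checkpoint name and the silent dummy audio are illustrative only, and `return_tensors="np"` is assumed to be supported by the underlying feature extractor:

```python
import numpy as np
from mindnlp.transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# audio only: the call is forwarded to the feature extractor and returns log-mel input features
audio = np.zeros(16000, dtype=np.float32)   # 1 second of silence as a stand-in for real speech
inputs = processor(audio=audio, sampling_rate=16000, return_tensors="np")

# forwarded to the tokenizer: forced decoder ids for an English transcription task
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe", no_timestamps=True)
```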

mindnlp.transformers.models.whisper.processing_whisper.WhisperProcessor.__call__(*args, **kwargs)

Forwards the audio argument to WhisperFeatureExtractor's [~WhisperFeatureExtractor.__call__] and the text argument to [~WhisperTokenizer.__call__]. Please refer to the docstring of the above two methods for more information.

Source code in mindnlp\transformers\models\whisper\processing_whisper.py
def __call__(self, *args, **kwargs):
    """
    Forwards the `audio` argument to WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] and the `text`
    argument to [`~WhisperTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
    information.
    """
    # For backward compatibility
    if self._in_target_context_manager:
        return self.current_processor(*args, **kwargs)

    audio = kwargs.pop("audio", None)
    sampling_rate = kwargs.pop("sampling_rate", None)
    text = kwargs.pop("text", None)
    if len(args) > 0:
        audio = args[0]
        args = args[1:]

    if audio is None and text is None:
        raise ValueError("You need to specify either an `audio` or `text` input to process.")

    if audio is not None:
        inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
    if text is not None:
        encodings = self.tokenizer(text, **kwargs)

    if text is None:
        return inputs

    elif audio is None:
        return encodings
    else:
        inputs["labels"] = encodings["input_ids"]
        return inputs
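
A sketch of the combined audio + text path described above: when both are given, the tokenized transcript is attached to the feature extractor output as `labels` (checkpoint name and dummy audio are assumptions):

```python
import numpy as np
from mindnlp.transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

audio = np.zeros(16000, dtype=np.float32)
batch = processor(audio=audio, sampling_rate=16000, text=" hello world")
# batch contains the feature extractor's "input_features" plus "labels" from the tokenizer
```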

mindnlp.transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode(*args, **kwargs)

This method forwards all its arguments to WhisperTokenizer's [~PreTrainedTokenizer.batch_decode]. Please refer to the docstring of this method for more information.

Source code in mindnlp\transformers\models\whisper\processing_whisper.py
def batch_decode(self, *args, **kwargs):
    """
    This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
    refer to the docstring of this method for more information.
    """
    return self.tokenizer.batch_decode(*args, **kwargs)

mindnlp.transformers.models.whisper.processing_whisper.WhisperProcessor.decode(*args, **kwargs)

This method forwards all its arguments to WhisperTokenizer's [~PreTrainedTokenizer.decode]. Please refer to the docstring of this method for more information.

Source code in mindnlp\transformers\models\whisper\processing_whisper.py
def decode(self, *args, **kwargs):
    """
    This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
    the docstring of this method for more information.
    """
    return self.tokenizer.decode(*args, **kwargs)

mindnlp.transformers.models.whisper.configuration_whisper

Whisper model configuration

mindnlp.transformers.models.whisper.configuration_whisper.WhisperConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [WhisperModel]. It is used to instantiate a Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Whisper openai/whisper-tiny architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
vocab_size

Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the decoder_input_ids passed when calling [WhisperModel]

TYPE: `int`, *optional*, defaults to 51865 DEFAULT: 51865

num_mel_bins

Number of mel features used per input feature. Should correspond to the value used in the WhisperProcessor class.

TYPE: `int`, *optional*, defaults to 80 DEFAULT: 80

encoder_layers

Number of encoder layers.

TYPE: `int`, *optional*, defaults to 4 DEFAULT: 4

decoder_layers

Number of decoder layers.

TYPE: `int`, *optional*, defaults to 4 DEFAULT: 4

encoder_attention_heads

Number of attention heads for each attention layer in the Transformer encoder.

TYPE: `int`, *optional*, defaults to 6 DEFAULT: 6

decoder_attention_heads

Number of attention heads for each attention layer in the Transformer decoder.

TYPE: `int`, *optional*, defaults to 6 DEFAULT: 6

encoder_ffn_dim

Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.

TYPE: `int`, *optional*, defaults to 1536 DEFAULT: 1536

decoder_ffn_dim

Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.

TYPE: `int`, *optional*, defaults to 1536 DEFAULT: 1536

encoder_layerdrop

The LayerDrop probability for the encoder. See the LayerDrop paper for more details.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

decoder_layerdrop

The LayerDrop probability for the decoder. See the LayerDrop paper for more details.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

decoder_start_token_id

Corresponds to the "<|startoftranscript|>" token, which is automatically used when no decoder_input_ids are provided to the generate function. It is used to guide the model's generation process depending on the task.

TYPE: `int`, *optional*, defaults to 50257 DEFAULT: 50257

use_cache

Whether or not the model should return the last key/values attentions (not used by all models).

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

is_encoder_decoder

Whether the model is used as an encoder/decoder or not.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

activation_function

The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "silu" and "gelu_new" are supported.

TYPE: `str`, *optional*, defaults to `"gelu"` DEFAULT: 'gelu'

d_model

Dimensionality of the layers.

TYPE: `int`, *optional*, defaults to 384 DEFAULT: 384

dropout

The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.

TYPE: `float`, *optional*, defaults to 0.1 DEFAULT: 0.0

attention_dropout

The dropout ratio for the attention probabilities.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

activation_dropout

The dropout ratio for activations inside the fully connected layer.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

init_std

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

scale_embedding

Scale embeddings by dividing by sqrt(d_model).

TYPE: `bool`, *optional*, defaults to False DEFAULT: False

max_source_positions

The maximum sequence length of log-mel filter-bank features that this model might ever be used with.

TYPE: `int`, *optional*, defaults to 1500 DEFAULT: 1500

max_target_positions

The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).

TYPE: `int`, *optional*, defaults to 448 DEFAULT: 448

pad_token_id

Padding token id.

TYPE: `int`, *optional*, defaults to 50256 DEFAULT: 50256

bos_token_id

Begin of stream token id.

TYPE: `int`, *optional*, defaults to 50256 DEFAULT: 50256

eos_token_id

End of stream token id.

TYPE: `int`, *optional*, defaults to 50256 DEFAULT: 50256

suppress_tokens

A list containing the non-speech tokens that will be used by the logit processor in the generate function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI correspond to the English-only and the multilingual model, respectively.

TYPE: `List[int]`, *optional* DEFAULT: None

begin_suppress_tokens

A list containing tokens that will be suppressed at the beginning of the sampling process. Initialized as the token for " " (blank_token_id) and the eos_token_id.

TYPE: `List[int]`, *optional*, defaults to `[220,50256]` DEFAULT: [220, 50256]

use_weighted_layer_sum

Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an instance of [WhisperForAudioClassification].

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

classifier_proj_size

Dimensionality of the projection before token mean-pooling for classification. Only relevant when using an instance of [WhisperForAudioClassification].

TYPE: `int`, *optional*, defaults to 256 DEFAULT: 256

apply_spec_augment

Whether to apply SpecAugment data augmentation to the outputs of the feature encoder. For reference see SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

mask_time_prob

Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking procedure generates mask_time_prob*len(time_axis)/mask_time_length independent masks over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector span to be masked, mask_time_prob should be prob_vector_start*mask_time_length. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if apply_spec_augment == True.

TYPE: `float`, *optional*, defaults to 0.05 DEFAULT: 0.05

mask_time_length

Length of vector span along the time axis.

TYPE: `int`, *optional*, defaults to 10 DEFAULT: 10

mask_time_min_masks

The minimum number of masks of length mask_time_length generated along the time axis, at each time step, irrespective of mask_time_prob. Only relevant if mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

mask_feature_prob

Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The masking procedure generates mask_feature_prob*len(feature_axis)/mask_feature_length independent masks over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector span to be masked, mask_feature_prob should be prob_vector_start*mask_feature_length. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if apply_spec_augment is True.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

mask_feature_length

Length of vector span along the feature axis.

TYPE: `int`, *optional*, defaults to 10 DEFAULT: 10

mask_feature_min_masks

The minimum number of masks of length mask_feature_length generated along the feature axis, at each time step, irrespective of mask_feature_prob. Only relevant if mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks.

TYPE: `int`, *optional*, defaults to 0 DEFAULT: 0

median_filter_width

Width of the median filter used to smooth the cross-attention outputs when computing token timestamps. Should be an odd number.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

>>> from transformers import WhisperConfig, WhisperModel

>>> # Initializing a Whisper tiny style configuration
>>> configuration = WhisperConfig()

>>> # Initializing a model (with random weights) from the tiny style configuration
>>> model = WhisperModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in mindnlp\transformers\models\whisper\configuration_whisper.py
class WhisperConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
    Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Whisper
    [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 51865):
            Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
            `decoder_input_ids` passed when calling [`WhisperModel`]
        num_mel_bins (`int`, *optional*, defaults to 80):
            Number of mel features used per input features. Should correspond to the value used in the
            `WhisperProcessor` class.
        encoder_layers (`int`, *optional*, defaults to 4):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 4):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 1536):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 1536):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_start_token_id (`int`, *optional*, defaults to 50257):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model`s generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        d_model (`int`, *optional*, defaults to 384):
            Dimensionality of the layers.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_embedding (`bool`, *optional*, defaults to False):
            Scale embeddings by diving by sqrt(d_model).
        max_source_positions (`int`, *optional*, defaults to 1500):
            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
        max_target_positions (`int`, *optional*, defaults to 448):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        pad_token_id (`int`, *optional*, defaults to 50256):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 50256):
            Begin of stream token id.
        eos_token_id (`int`, *optional*, defaults to 50256):
            End of stream token id.
        suppress_tokens (`List[int]`, *optional*):
            A list containing the non-speech tokens that will be used by the logit processor in the `generate`
            function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
            `multilingual` model.
        begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
            A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
            the token for `" "` (`blank_token_id`) and the `eos_token_id`
        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
            instance of [`WhisperForAudioClassification`].
        classifier_proj_size (`int`, *optional*, defaults to 256):
            Dimensionality of the projection before token mean-pooling for classification. Only relevant when using an
            instance of [`WhisperForAudioClassification`].
        apply_spec_augment (`bool`, *optional*, defaults to `False`):
            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
            Recognition](https://arxiv.org/abs/1904.08779).
        mask_time_prob (`float`, *optional*, defaults to 0.05):
            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
            procecure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
            reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
            actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
        mask_time_length (`int`, *optional*, defaults to 10):
            Length of vector span along the time axis.
        mask_time_min_masks (`int`, *optional*, defaults to 2),:
            The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
            irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
            mask_time_min_masks''
        mask_feature_prob (`float`, *optional*, defaults to 0.0):
            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
            masking procecure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over
            the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
            may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
            True`.
        mask_feature_length (`int`, *optional*, defaults to 10):
            Length of vector span along the feature axis.
        mask_feature_min_masks (`int`, *optional*, defaults to 0),:
            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
            step, irrespectively of `mask_feature_prob`. Only relevant if
            `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
        median_filter_width (`int`, *optional*, defaults to 7):
            Width of the median filter used to smoothen to cross-attention outputs when computing token timestamps.
            Should be an odd number.

    Example:

    ```python
    >>> from transformers import WhisperConfig, WhisperModel

    >>> # Initializing a Whisper tiny style configuration
    >>> configuration = WhisperConfig()

    >>> # Initializing a model (with random weights) from the tiny style configuration
    >>> model = WhisperModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "whisper"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "encoder_attention_heads",
        "num_attention_heads": "encoder_attention_heads",
        "hidden_size": "d_model",
    }

    def __init__(
        self,
        vocab_size=51865,
        num_mel_bins=80,
        encoder_layers=4,
        encoder_attention_heads=6,
        decoder_layers=4,
        decoder_attention_heads=6,
        decoder_ffn_dim=1536,
        encoder_ffn_dim=1536,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        decoder_start_token_id=50257,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=384,
        dropout=0.0,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        scale_embedding=False,
        max_source_positions=1500,
        max_target_positions=448,
        pad_token_id=50256,
        bos_token_id=50256,
        eos_token_id=50256,
        suppress_tokens=None,
        begin_suppress_tokens=[220, 50256],
        use_weighted_layer_sum=False,
        classifier_proj_size=256,
        apply_spec_augment=False,
        mask_time_prob=0.05,
        mask_time_length=10,
        mask_time_min_masks=2,
        mask_feature_prob=0.0,
        mask_feature_length=10,
        mask_feature_min_masks=0,
        median_filter_width=7,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions

        # Audio Classification-specific parameters. Feel free to ignore for other classes.
        self.classifier_proj_size = classifier_proj_size
        self.use_weighted_layer_sum = use_weighted_layer_sum

        # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
        self.apply_spec_augment = apply_spec_augment
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        self.mask_feature_prob = mask_feature_prob
        self.mask_feature_length = mask_feature_length
        self.mask_feature_min_masks = mask_feature_min_masks

        self.median_filter_width = median_filter_width

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            suppress_tokens=suppress_tokens,
            begin_suppress_tokens=begin_suppress_tokens,
            **kwargs,
        )

mindnlp.transformers.models.whisper.feature_extraction_whisper

Feature extractor class for Whisper

mindnlp.transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor

Bases: SequenceFeatureExtractor

Constructs a Whisper feature extractor.

This feature extractor inherits from [~feature_extraction_sequence_utils.SequenceFeatureExtractor] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the Short Time Fourier Transform which should match pytorch's ops.stft equivalent.

PARAMETER DESCRIPTION
feature_size

The feature dimension of the extracted features.

TYPE: `int`, *optional*, defaults to 80 DEFAULT: 80

sampling_rate

The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 16000 DEFAULT: 16000

hop_length

Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.

TYPE: `int`, *optional*, defaults to 160 DEFAULT: 160

chunk_length

The maximum number of chunks of sampling_rate samples used to trim and pad longer or shorter audio sequences.

TYPE: `int`, *optional*, defaults to 30 DEFAULT: 30

n_fft

Size of the Fourier transform.

TYPE: `int`, *optional*, defaults to 400 DEFAULT: 400

padding_value

Padding value used to pad the audio. Should correspond to silences.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0
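
A minimal usage sketch, assuming WhisperFeatureExtractor is importable from mindnlp.transformers (mirroring the HuggingFace API) and using the defaults listed above:

>>> import numpy as np
>>> from mindnlp.transformers import WhisperFeatureExtractor
>>> feature_extractor = WhisperFeatureExtractor()  # default 80 mel bins, 30 s chunks, 16 kHz
>>> waveform = np.zeros(16000, dtype=np.float32)   # 1 second of silence at 16 kHz
>>> features = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
>>> features["input_features"].shape               # expected (1, 80, 3000) with the defaults above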

Source code in mindnlp\transformers\models\whisper\feature_extraction_whisper.py (lines 32-278)
class WhisperFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `ops.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
        padding_value=0.0,
        return_attention_mask=False,  # pad inputs to max length with silence token (zero) and no attention mask
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.n_samples = chunk_length * sampling_rate
        self.nb_max_frames = self.n_samples // hop_length
        self.sampling_rate = sampling_rate
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=1 + n_fft // 2,
            num_mel_filters=feature_size,
            min_frequency=0.0,
            max_frequency=8000.0,
            sampling_rate=sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )

    def _np_extract_fbank_features(self, waveform_batch: np.array) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
        implementation with 1e-5 tolerance.
        """
        log_spec_batch = []
        for waveform in waveform_batch:
            log_spec = spectrogram(
                waveform,
                window_function(self.n_fft, "hann"),
                frame_length=self.n_fft,
                hop_length=self.hop_length,
                power=2.0,
                mel_filters=self.mel_filters,
                log_mel="log10",
            )
            log_spec = log_spec[:, :-1]
            log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
            log_spec = (log_spec + 4.0) / 4.0
            log_spec_batch.append(log_spec)
        log_spec_batch = np.array(log_spec_batch)
        return log_spec_batch


    @staticmethod
    # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
    def zero_mean_unit_var_norm(
        input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
    ) -> List[np.ndarray]:
        """
        Every array in the list is normalized to have zero mean and unit variance
        """
        if attention_mask is not None:
            attention_mask = np.array(attention_mask, np.int32)
            normed_input_values = []

            for vector, length in zip(input_values, attention_mask.sum(-1)):
                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
                if length < normed_slice.shape[0]:
                    normed_slice[length:] = padding_value

                normed_input_values.append(normed_slice)
        else:
            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

        return normed_input_values

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        padding: Optional[str] = "max_length",
        max_length: Optional[int] = None,
        sampling_rate: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        return_token_timestamps: Optional[bool] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'ms'`: Return MindSpore `mindspore.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
        elif not is_batched and not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech, dtype=np.float32)
        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
            raw_speech = raw_speech.astype(np.float32)

        # always return batch
        if not is_batched:
            raw_speech = [np.asarray([raw_speech]).T]

        batched_speech = BatchFeature({"input_features": raw_speech})

        # convert into correct format for padding

        padded_inputs = self.pad(
            batched_speech,
            padding=padding,
            max_length=max_length if max_length else self.n_samples,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask or do_normalize,
        )

        # zero-mean and unit-variance normalization
        if do_normalize:
            padded_inputs["input_features"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_features"],
                attention_mask=padded_inputs["attention_mask"],
                padding_value=self.padding_value,
            )
            padded_inputs["input_features"] = np.stack(padded_inputs["input_features"], axis=0)

        # make sure list is in array format
        input_features = padded_inputs.get("input_features").transpose(2, 0, 1)

        extract_fbank_features = self._np_extract_fbank_features
        input_features = extract_fbank_features(input_features[0])

        if isinstance(input_features[0], List):
            padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]

        else:
            padded_inputs["input_features"] = input_features

        if return_attention_mask:
            # rescale from sample (48000) to feature (3000)
            padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]

        if return_token_timestamps is not None:
            padded_inputs["num_frames"] = [len(raw_speech_i) // self.hop_length for raw_speech_i in raw_speech]

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs

mindnlp.transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.__call__(raw_speech, truncation=True, pad_to_multiple_of=None, return_tensors=None, return_attention_mask=None, padding='max_length', max_length=None, sampling_rate=None, do_normalize=None, return_token_timestamps=None, **kwargs)

Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for the STFT computation if available, otherwise a slower NumPy based one.

PARAMETER DESCRIPTION
raw_speech

The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not stereo, i.e. single float per timestep.

TYPE: `np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`

truncation

Activates truncation to cut input sequences longer than max_length to max_length.

TYPE: `bool`, *optional*, default to `True` DEFAULT: True

pad_to_multiple_of

If set will pad the sequence to a multiple of the provided value.

This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.

TYPE: `int`, *optional*, defaults to None DEFAULT: None

return_attention_mask

Whether to return the attention mask. If left to the default, will return the attention mask according to the specific feature_extractor's default.

What are attention masks?

For Whisper models, attention_mask should always be passed for batched inference, to avoid subtle bugs.

TYPE: `bool`, *optional* DEFAULT: None

return_tensors

If set, will return tensors instead of list of python integers. Acceptable values are:

  • 'ms': Return MindSpore mindspore.Tensor objects.
  • 'np': Return Numpy np.ndarray objects.

TYPE: `str` or [`~utils.TensorType`], *optional* DEFAULT: None

sampling_rate

The sampling rate at which the raw_speech input was sampled. It is strongly recommended to pass sampling_rate at the forward call to prevent silent errors and allow automatic speech recognition pipeline.

TYPE: `int`, *optional* DEFAULT: None

padding_value

The value that is used to fill the padding values / vectors.

TYPE: `float`, *optional*, defaults to 0.0

do_normalize

Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly improve the performance of the model.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: None

return_token_timestamps

Whether or not to return the number of frames of the input raw_speech. These num_frames can be used by the model to compute word level timestamps.

TYPE: `bool`, *optional*, defaults to `None` DEFAULT: None
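
A hedged sketch of a batched call, again assuming the extractor is importable from mindnlp.transformers; the commented shapes follow from the default 80 mel bins and 30-second chunk length:

>>> import numpy as np
>>> from mindnlp.transformers import WhisperFeatureExtractor
>>> extractor = WhisperFeatureExtractor()
>>> batch = [np.random.randn(16000).astype(np.float32), np.random.randn(32000).astype(np.float32)]
>>> inputs = extractor(batch, sampling_rate=16000, return_attention_mask=True, return_tensors="np")
>>> # inputs["input_features"]: (2, 80, 3000); inputs["attention_mask"]: (2, 3000) with the defaults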

Source code in mindnlp\transformers\models\whisper\feature_extraction_whisper.py (lines 141-278)
def __call__(
    self,
    raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
    truncation: bool = True,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_attention_mask: Optional[bool] = None,
    padding: Optional[str] = "max_length",
    max_length: Optional[int] = None,
    sampling_rate: Optional[int] = None,
    do_normalize: Optional[bool] = None,
    return_token_timestamps: Optional[bool] = None,
    **kwargs,
) -> BatchFeature:
    """
    Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
    the STFT computation if available, otherwise a slower NumPy based one.

    Args:
        raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
            The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
            values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
            stereo, i.e. single float per timestep.
        truncation (`bool`, *optional*, default to `True`):
            Activates truncation to cut input sequences longer than *max_length* to *max_length*.
        pad_to_multiple_of (`int`, *optional*, defaults to None):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
            `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
        return_attention_mask (`bool`, *optional*):
            Whether to return the attention mask. If left to the default, will return the attention mask according
            to the specific feature_extractor's default.

            [What are attention masks?](../glossary#attention-mask)

            <Tip>

            For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
            bugs.

            </Tip>

        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            - `'ms'`: Return MindSpore `mindspore.Tensor` objects.
            - `'np'`: Return Numpy `np.ndarray` objects.
        sampling_rate (`int`, *optional*):
            The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
            `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
            pipeline.
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values / vectors.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance of the model.
        return_token_timestamps (`bool`, *optional*, defaults to `None`):
            Whether or not to return the number of frames of the input raw_speech.
            These num_frames can be used by the model to compute word level timestamps.
    """

    if sampling_rate is not None:
        if sampling_rate != self.sampling_rate:
            raise ValueError(
                f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
                f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
                f" was sampled with {self.sampling_rate} and not {sampling_rate}."
            )
    else:
        logger.warning(
            "It is strongly recommended to pass the `sampling_rate` argument to this function. "
            "Failing to do so can result in silent errors that might be hard to debug."
        )

    is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
    if is_batched_numpy and len(raw_speech.shape) > 2:
        raise ValueError(f"Only mono-channel audio is supported for input to {self}")
    is_batched = is_batched_numpy or (
        isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
    )

    if is_batched:
        raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
    elif not is_batched and not isinstance(raw_speech, np.ndarray):
        raw_speech = np.asarray(raw_speech, dtype=np.float32)
    elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
        raw_speech = raw_speech.astype(np.float32)

    # always return batch
    if not is_batched:
        raw_speech = [np.asarray([raw_speech]).T]

    batched_speech = BatchFeature({"input_features": raw_speech})

    # convert into correct format for padding

    padded_inputs = self.pad(
        batched_speech,
        padding=padding,
        max_length=max_length if max_length else self.n_samples,
        truncation=truncation,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask or do_normalize,
    )

    # zero-mean and unit-variance normalization
    if do_normalize:
        padded_inputs["input_features"] = self.zero_mean_unit_var_norm(
            padded_inputs["input_features"],
            attention_mask=padded_inputs["attention_mask"],
            padding_value=self.padding_value,
        )
        padded_inputs["input_features"] = np.stack(padded_inputs["input_features"], axis=0)

    # make sure list is in array format
    input_features = padded_inputs.get("input_features").transpose(2, 0, 1)

    extract_fbank_features = self._np_extract_fbank_features
    input_features = extract_fbank_features(input_features[0])

    if isinstance(input_features[0], List):
        padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]

    else:
        padded_inputs["input_features"] = input_features

    if return_attention_mask:
        # rescale from sample (48000) to feature (3000)
        padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]

    if return_token_timestamps is not None:
        padded_inputs["num_frames"] = [len(raw_speech_i) // self.hop_length for raw_speech_i in raw_speech]

    if return_tensors is not None:
        padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

    return padded_inputs

mindnlp.transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.zero_mean_unit_var_norm(input_values, attention_mask, padding_value=0.0) staticmethod

Every array in the list is normalized to have zero mean and unit variance
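
A small sketch of this normalization on a toy array, where the attention mask marks the last position as padding (values chosen only for illustration):

>>> import numpy as np
>>> from mindnlp.transformers import WhisperFeatureExtractor
>>> values = [np.array([1.0, 2.0, 3.0, 0.0], dtype=np.float32)]
>>> mask = [np.array([1, 1, 1, 0], dtype=np.int32)]
>>> normed = WhisperFeatureExtractor.zero_mean_unit_var_norm(values, attention_mask=mask)
>>> # normed[0][:3] has zero mean and unit variance; the padded position is reset to padding_value (0.0)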

Source code in mindnlp\transformers\models\whisper\feature_extraction_whisper.py (lines 118-139)
@staticmethod
# Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
def zero_mean_unit_var_norm(
    input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
) -> List[np.ndarray]:
    """
    Every array in the list is normalized to have zero mean and unit variance
    """
    if attention_mask is not None:
        attention_mask = np.array(attention_mask, np.int32)
        normed_input_values = []

        for vector, length in zip(input_values, attention_mask.sum(-1)):
            normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
            if length < normed_slice.shape[0]:
                normed_slice[length:] = padding_value

            normed_input_values.append(normed_slice)
    else:
        normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

    return normed_input_values

mindnlp.transformers.models.whisper.tokenization_whisper_fast

Tokenization classes for Whisper.

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast

Bases: PreTrainedTokenizerFast

Construct a "fast" Whisper tokenizer (backed by HuggingFace's tokenizers library).

This tokenizer inherits from [PreTrainedTokenizerFast] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

PARAMETER DESCRIPTION
vocab_file

Path to the vocabulary file.

TYPE: `str`, *optional* DEFAULT: None

merges_file

Path to the merges file.

TYPE: `str`, *optional* DEFAULT: None

normalizer_file

Path to the normalizer file.

TYPE: `str`, *optional* DEFAULT: None

tokenizer_file

Path to tokenizers file (generally has a .json extension) that contains everything needed to load the tokenizer.

TYPE: `str`, *optional* DEFAULT: None

unk_token

The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead.

TYPE: `str`, *optional*, defaults to `"<|endoftext|>"` DEFAULT: '<|endoftext|>'

bos_token

The beginning of sequence token. The decoder_start_token_id is used to set the first token as "<|startoftranscript|>" when generating.

TYPE: `str`, *optional*, defaults to `"<|endoftext|>"` DEFAULT: '<|endoftext|>'

eos_token

The end of sequence token.

TYPE: `str`, *optional*, defaults to `"<|endoftext|>"` DEFAULT: '<|endoftext|>'

add_prefix_space

Whether or not to add an initial space to the input. This allows the leading word to be treated just like any other word. (The Whisper tokenizer detects the beginning of words by the preceding space.)

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

language

The language of the transcription text. The corresponding language id token is appended to the start of the sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token "<|es|>" is appended to the start of sequence. This should be used for multilingual fine-tuning only.

TYPE: `str`, *optional* DEFAULT: None

task

Task identifier to append at the start of sequence (if any). This should be used for multilingual fine-tuning, with "transcribe" for speech recognition and "translate" for speech translation.

TYPE: `str`, *optional* DEFAULT: None

predict_timestamps

Whether to omit the <|notimestamps|> token at the start of the sequence.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False
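
A usage sketch, assuming WhisperTokenizerFast is importable from mindnlp.transformers and that the openai/whisper-tiny checkpoint is available; generated_ids below is a hypothetical sequence of ids produced by a model:

>>> from mindnlp.transformers import WhisperTokenizerFast
>>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="english", task="transcribe")
>>> ids = tokenizer(" Hello world").input_ids
>>> tokenizer.decode(ids, skip_special_tokens=True)  # round-trips to " Hello world"
>>> # generated_ids (hypothetical) could also be decoded with timestamp markers kept in the text:
>>> # tokenizer.decode(generated_ids, decode_with_timestamps=True)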

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py (lines 44-608)
class WhisperTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Whisper tokenizer (backed by HuggingFace's *tokenizers* library).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        normalizer_file (`str`, *optional*):
            Path to the normalizer_file file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
            `"<|startoftranscript|>"` when generating.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (Whisper tokenizer detect beginning of words by the preceding space).
        language (`str`, *optional*):
            The language of the transcription text. The corresponding language id token is appended to the start of the
            sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
            `"<|es|>"` is appended to the start of sequence. This should be used for multilingual fine-tuning only.
        task (`str`, *optional*):
            Task identifier to append at the start of sequence (if any). This should be used for mulitlingual
            fine-tuning, with `"transcribe"` for speech recognition and `"translate"` for speech translation.
        predict_timestamps (`bool`, *optional*, defaults to `False`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = WhisperTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        normalizer_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        language=None,
        task=None,
        predict_timestamps=False,
        **kwargs,
    ):
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )

        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        self.add_bos_token = kwargs.pop("add_bos_token", False)

        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        if normalizer_file is not None:
            with open(normalizer_file, encoding="utf-8") as vocab_handle:
                self.english_spelling_normalizer = json.load(vocab_handle)
        else:
            self.english_spelling_normalizer = None

        self.add_prefix_space = add_prefix_space
        self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")

        self.language = language
        self.task = task
        self.predict_timestamps = predict_timestamps

    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._batch_encode_plus(*args, **kwargs)

    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._encode_plus
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._encode_plus(*args, **kwargs)

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._decode_with_timestamps
    def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_precision=0.02) -> str:
        """
        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
        given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        """
        timestamp_begin = self.all_special_ids[-1] + 1
        outputs = [[]]

        cur_max_timestamp = 0.0
        prev_segments_len = 0.0

        for token in token_ids:
            if token >= timestamp_begin:
                timestamp = float((token - timestamp_begin) * time_precision)

                if timestamp < cur_max_timestamp:
                    # next segment has started
                    prev_segments_len += cur_max_timestamp

                cur_max_timestamp = timestamp

                outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
                outputs.append([])
            else:
                outputs[-1].append(token)
        outputs = [
            s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
        ]
        return "".join(outputs)

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._compute_offsets
    def _compute_offsets(self, token_ids, time_precision=0.02):
        """
        Compute offsets for a given tokenized input

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        offsets = []
        # ensure torch tensor of token ids is placed on cpu
        if "torch" in str(type(token_ids)) and (hasattr(token_ids, "cpu") and callable(token_ids.cpu)):
            token_ids = token_ids.cpu()
        token_ids = np.array(token_ids)
        if token_ids.shape[0] > 1 and len(token_ids.shape) > 1:
            raise ValueError("Can only process a single input at a time")
        timestamp_begin = self.all_special_ids[-1] + 1
        timestamp_tokens = token_ids >= timestamp_begin

        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
        if consecutive.shape[0] == 0 and timestamp_tokens.sum() <= 1:
            # either there are no timestamps or there are no consecutive ones
            return []
        elif np.where(timestamp_tokens)[0][-1] + 1 not in consecutive:
            # we add the final timestamp if it is not already in the list
            consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1)

        last_slice = np.where(timestamp_tokens)[0][0]
        for current_slice in consecutive:
            sliced_tokens = token_ids[last_slice:current_slice]
            if len(sliced_tokens) > 1:
                start_timestamp_position = sliced_tokens[0].item() - timestamp_begin
                end_timestamp_position = sliced_tokens[-1].item() - timestamp_begin
                # strip timestamp tokens from the text output
                sliced_tokens = self._preprocess_token_ids(sliced_tokens)
                text = self._decode(sliced_tokens)
                text = self._filter_timestamp_ids(text)
                offsets.append(
                    {
                        "text": text,
                        "timestamp": (
                            start_timestamp_position * time_precision,
                            end_timestamp_position * time_precision,
                        ),
                    }
                )
            last_slice = current_slice

        return offsets

    @lru_cache
    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.timestamp_ids
    def timestamp_ids(self, time_precision=0.02):
        """
        Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

        Args:
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._preprocess_token_ids
    def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
        """
        Pre-process the token ids for decoding by removing the prompt tokens ids and timestamp token ids.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Typically, obtained using the `__call__` method of the tokenizer.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens from the token ids. If `True`, the prompt token ids will be
                removed.
        """
        if skip_special_tokens:
            prompt_token_id = self.convert_tokens_to_ids("<|startofprev|>")
            decoder_start_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
            token_ids = self._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)

        return token_ids

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._filter_timestamp_ids
    def _filter_timestamp_ids(self, token_ids):
        return re.sub(self.timestamp_pat, "", token_ids)

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.decode
    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        output_offsets: bool = False,
        time_precision: float = 0.02,
        decode_with_timestamps: bool = False,
        normalize: bool = False,
        basic_normalize: bool = False,
        remove_diacritics: bool = False,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            output_offsets (`bool`, *optional*, defaults to `False`):
                Whether or not to output the offsets of the tokens. This should only be set if the model predicted
                timestamps.
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
            decode_with_timestamps (`bool`, *optional*, defaults to `False`):
                Whether or not to decode with timestamps included in the raw text.
            normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to apply the English text normalizer to the decoded text. Only applicable when the
                target text is in English. Otherwise, the basic text normalizer should be applied.
            basic_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual
                target text.
            remove_diacritics (`bool`, *optional*, defaults to `False`):
                Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may
                destroy information in the decoded text, hence it should be used with caution.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.
        Returns:
            `str`: The decoded sentence.
        """
        filtered_ids = self._preprocess_token_ids(
            token_ids,
            skip_special_tokens=skip_special_tokens,
        )

        text = super().decode(
            filtered_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            normalize=normalize,
            basic_normalize=basic_normalize,
            remove_diacritics=remove_diacritics,
            **kwargs,
        )
        if decode_with_timestamps:
            # legacy method to decode timestamps when not included in the tokenizer vocabulary
            text = self._decode_with_timestamps(
                filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
            )
        else:
            text = self._filter_timestamp_ids(text)

        # retrieve offsets
        if output_offsets:
            offsets = self._compute_offsets(token_ids, time_precision=time_precision)
            return {"text": text, "offsets": offsets}
        return text

    def _decode(
        self, *args, normalize: bool = False, basic_normalize: bool = False, remove_diacritics: bool = False, **kwargs
    ) -> str:
        text = super()._decode(*args, **kwargs)

        if normalize:
            clean_text = self._normalize(text)
            return clean_text
        elif basic_normalize:
            clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics)
            return clean_text
        else:
            return text

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._normalize
    def _normalize(self, text):
        warnings.warn(
            "The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
            "You can normalize an input string using the Whisper English normalizer using the `normalize` method."
        )
        return self.normalize(text)

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._basic_normalize
    def _basic_normalize(self, text, remove_diacritics=False):
        warnings.warn(
            "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
            "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
        )
        return self.basic_normalize(text, remove_diacritics=remove_diacritics)

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize
    def normalize(self, text):
        """
        Normalize a given string using the `EnglishTextNormalizer` class, which performs common transformations on
        English text.
        """
        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
        return normalizer(text)

    @staticmethod
    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize
    def basic_normalize(text, remove_diacritics=False):
        """
        Normalize a given string using the `BasicTextNormalizer` class, which performs common transformations on
        multilingual text.
        """
        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
        return normalizer(text)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)

        normalizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
        )

        if self.english_spelling_normalizer is not None:
            with open(normalizer_file, "w", encoding="utf-8") as f:
                f.write(
                    json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                )

        return tuple(files) + (normalizer_file,)

    def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
        """
        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
        update the prefix tokens as required when fine-tuning. Example:

        ```python
        >>> # instantiate the tokenizer and set the prefix token to Spanish
        >>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
        >>> # now switch the prefix token from Spanish to French
        >>> tokenizer.set_prefix_tokens(language="french")
        ```

        Args:
            language (`str`, *optional*, defaults to `None`):
                The language of the transcription text.
            task (`str`, *optional*, defaults to `None`):
                Task identifier to append at the start of sequence (if any).
            predict_timestamps (`bool`, *optional*, defaults to `None`):
                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
        """
        self.language = language if language is not None else self.language
        self.task = task if task is not None else self.task
        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps

        prefix_token_ids = self.prefix_tokens
        prefixes = self.convert_ids_to_tokens(prefix_token_ids)
        eos = self.eos_token
        eos_token_id = self.eos_token_id
        prefix_template = " ".join([f"{token}:0" for token in prefixes])
        self.backend_tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{prefix_template} $A:0 {eos}:0",
            pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
            special_tokens=[
                (eos, eos_token_id),
                *zip(prefixes, prefix_token_ids),
            ],
        )

    @property
    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.prefix_tokens
    def prefix_tokens(self) -> List[int]:
        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
        langs = tuple(LANGUAGES.keys())

        if self.language is not None:
            self.language = self.language.lower()
            if self.language in TO_LANGUAGE_CODE:
                language_id = TO_LANGUAGE_CODE[self.language]
            elif self.language in TO_LANGUAGE_CODE.values():
                language_id = self.language
            else:
                is_language_code = len(self.language) == 2
                raise ValueError(
                    f"Unsupported language: {self.language}. Language should be one of:"
                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
                )

        if self.task is not None:
            if self.task not in TASK_IDS:
                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")

        bos_sequence = [bos_token_id]
        if self.language is not None:
            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
        if self.task is not None:
            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
        if not self.predict_timestamps:
            bos_sequence.append(notimestamps_token_id)
        return bos_sequence

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1]
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
        # prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
        # we don't want to force the bos token at position 1, as this is the starting token
        # when we generate, so we slice the prefix tokens to: <|lang_id|> <|task|> <|notimestamps|>
        # to get the forced tokens
        forced_tokens = self.prefix_tokens[1:]
        forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_tokens)]
        return forced_decoder_ids

    def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
        return _decode_asr(
            self,
            model_outputs,
            return_timestamps=return_timestamps,
            return_language=return_language,
            time_precision=time_precision,
        )

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_prompt_ids
    def get_prompt_ids(self, text: str, return_tensors="np"):
        """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
        batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)

        # Check for special tokens
        prompt_text_ids = batch_encoding["input_ids"][1:]
        special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
        if special_token_id is not None:
            token = self.convert_ids_to_tokens(special_token_id)
            raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")

        batch_encoding.convert_to_tensors(tensor_type=return_tensors)
        return batch_encoding["input_ids"]

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._strip_prompt
    def _strip_prompt(self, token_ids: List[int], prompt_token_id: int, decoder_start_token_id: int):
        if not isinstance(token_ids, list):
            token_ids = self._convert_to_list(token_ids)

        # handle case of empty token_ids for decoding with timestamps.
        # at this point token_ids is a list, so it is safe to use if not check.
        if not token_ids:
            return token_ids

        has_prompt = token_ids[0] == prompt_token_id
        if has_prompt:
            if decoder_start_token_id in token_ids:
                return token_ids[token_ids.index(decoder_start_token_id) :]
            else:
                return []

        return token_ids

    @staticmethod
    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._convert_to_list
    def _convert_to_list(token_ids):
        # convert type to ndarray if necessary
        if hasattr(token_ids, "numpy"):
            if "torch" in str(type(token_ids)):
                token_ids = token_ids.cpu().numpy()
            elif "tensorflow" in str(type(token_ids)):
                token_ids = token_ids.numpy()
        # now the token ids are either a numpy array, or a list of lists
        if isinstance(token_ids, np.ndarray):
            token_ids = token_ids.tolist()
        return token_ids
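
The snippet below is a minimal sketch of how the prefix-token machinery above fits together, assuming the mindnlp `WhisperTokenizerFast` mirrors the Hugging Face tokenizer API; the exact ids shown are the usual multilingual Whisper ones and depend on the checkpoint vocabulary.

```python
>>> from mindnlp.transformers import WhisperTokenizerFast

>>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="english", task="transcribe")
>>> # prefix_tokens expands to <|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>
>>> tokenizer.convert_ids_to_tokens(tokenizer.prefix_tokens)
['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>']
>>> # get_decoder_prompt_ids drops the leading <|startoftranscript|> and pairs each remaining
>>> # prefix token with the decoding position at which it should be forced
>>> tokenizer.get_decoder_prompt_ids(task="transcribe", language="english")
[(1, 50259), (2, 50359), (3, 50363)]
```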

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.basic_normalize(text, remove_diacritics=False) staticmethod

Normalize a given string using the BasicTextNormalizer class, which performs common transformations on multilingual text.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 405-413
@staticmethod
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize
def basic_normalize(text, remove_diacritics=False):
    """
    Normalize a given string using the `BasicTextNormalizer` class, which performs common transformations on
    multilingual text.
    """
    normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
    return normalizer(text)
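
A quick usage sketch with illustrative output: `BasicTextNormalizer` lower-cases, strips punctuation and collapses whitespace, and `remove_diacritics=True` additionally folds accented characters.

```python
>>> WhisperTokenizerFast.basic_normalize("Héllo,   Wörld!")
'héllo wörld'
>>> WhisperTokenizerFast.basic_normalize("Héllo,   Wörld!", remove_diacritics=True)
'hello world'
```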

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.build_inputs_with_special_tokens(token_ids_0, token_ids_1=None)

Build model inputs from a sequence by appending eos_token_id.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 504-509
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
    """Build model inputs from a sequence by appending eos_token_id."""
    if token_ids_1 is None:
        return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
    # We don't expect to process pairs, but leave the pair logic for API consistency
    return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
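
As a sketch (with `tokenizer` loaded as in the earlier example), the returned sequence is simply `prefix_tokens + token_ids_0 + [eos_token_id]`:

```python
>>> ids = tokenizer("hello world", add_special_tokens=False)["input_ids"]
>>> with_special = tokenizer.build_inputs_with_special_tokens(ids)
>>> with_special[: len(tokenizer.prefix_tokens)] == tokenizer.prefix_tokens
True
>>> with_special[-1] == tokenizer.eos_token_id
True
```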

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.decode(token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, output_offsets=False, time_precision=0.02, decode_with_timestamps=False, normalize=False, basic_normalize=False, remove_diacritics=False, **kwargs)

Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces.

Similar to doing self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids)).

PARAMETER DESCRIPTION
token_ids

List of tokenized input ids. Can be obtained using the __call__ method.

TYPE: `Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`

skip_special_tokens

Whether or not to remove special tokens in the decoding.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

clean_up_tokenization_spaces

Whether or not to clean up the tokenization spaces. If None, will default to self.clean_up_tokenization_spaces (available in the tokenizer_config).

TYPE: `bool`, *optional* DEFAULT: None

output_offsets

Whether or not to output the offsets of the tokens. This should only be set if the model predicted timestamps.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

time_precision

The time ratio to convert from token to time.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

decode_with_timestamps

Whether or not to decode with timestamps included in the raw text.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

normalize

Whether or not to apply the English text normalizer to the decoded text. Only applicable when the target text is in English. Otherwise, the basic text normalizer should be applied.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

basic_normalize

Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual target text.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

remove_diacritics

Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may destroy information in the decoded text, hence it should be used with caution.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

kwargs

Will be passed to the underlying model specific decode method.

TYPE: additional keyword arguments, *optional* DEFAULT: {}

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 290-364
def decode(
    self,
    token_ids,
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = None,
    output_offsets: bool = False,
    time_precision: float = 0.02,
    decode_with_timestamps: bool = False,
    normalize: bool = False,
    basic_normalize: bool = False,
    remove_diacritics: bool = False,
    **kwargs,
) -> str:
    """
    Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
    tokens and clean up tokenization spaces.

    Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

    Args:
        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
            List of tokenized input ids. Can be obtained using the `__call__` method.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (`bool`, *optional*):
            Whether or not to clean up the tokenization spaces. If `None`, will default to
            `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
        output_offsets (`bool`, *optional*, defaults to `False`):
            Whether or not to output the offsets of the tokens. This should only be set if the model predicted
            timestamps.
        time_precision (`float`, *optional*, defaults to 0.02):
            The time ratio to convert from token to time.
        decode_with_timestamps (`bool`, *optional*, defaults to `False`):
            Whether or not to decode with timestamps included in the raw text.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to apply the English text normalizer to the decoded text. Only applicable when the
            target text is in English. Otherwise, the basic text normalizer should be applied.
        basic_normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual
            target text.
        remove_diacritics (`bool`, *optional*, defaults to `False`):
            Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may
            destroy information in the decoded text, hence it should be used with caution.
        kwargs (additional keyword arguments, *optional*):
            Will be passed to the underlying model specific decode method.
    Returns:
        `str`: The decoded sentence.
    """
    filtered_ids = self._preprocess_token_ids(
        token_ids,
        skip_special_tokens=skip_special_tokens,
    )

    text = super().decode(
        filtered_ids,
        skip_special_tokens=skip_special_tokens,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        normalize=normalize,
        basic_normalize=basic_normalize,
        remove_diacritics=remove_diacritics,
        **kwargs,
    )
    if decode_with_timestamps:
        # legacy method to decode timestamps when not included in the tokenizer vocabulary
        text = self._decode_with_timestamps(
            filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
        )
    else:
        text = self._filter_timestamp_ids(text)

    # retrieve offsets
    if output_offsets:
        offsets = self._compute_offsets(token_ids, time_precision=time_precision)
        return {"text": text, "offsets": offsets}
    return text
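
A minimal sketch of the three decoding modes; `pred_ids` is a placeholder for token ids produced by `WhisperForConditionalGeneration.generate` and is not defined here.

```python
>>> # plain text, with special and timestamp tokens removed
>>> text = tokenizer.decode(pred_ids, skip_special_tokens=True)
>>> # keep the raw <|x.xx|> timestamp tokens in the string
>>> text_ts = tokenizer.decode(pred_ids, skip_special_tokens=True, decode_with_timestamps=True)
>>> # or return segment offsets alongside the text (only meaningful if the model predicted timestamps)
>>> out = tokenizer.decode(pred_ids, output_offsets=True)
>>> out["text"], out["offsets"]
```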

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.get_prompt_ids(text, return_tensors='np')

Converts prompt text to IDs that can be passed to [~WhisperForConditionalGeneration.generate].

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 563-575
def get_prompt_ids(self, text: str, return_tensors="np"):
    """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
    batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)

    # Check for special tokens
    prompt_text_ids = batch_encoding["input_ids"][1:]
    special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
    if special_token_id is not None:
        token = self.convert_ids_to_tokens(special_token_id)
        raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")

    batch_encoding.convert_to_tensors(tensor_type=return_tensors)
    return batch_encoding["input_ids"]
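
A short sketch of building and inspecting prompt ids; the prompt text is arbitrary, and forwarding the result to `generate` through its `prompt_ids` argument follows the transformers-style workflow, which is assumed to carry over to mindnlp.

```python
>>> prompt_ids = tokenizer.get_prompt_ids("Caldwell, Quilter")  # bias the model toward these spellings
>>> tokenizer.convert_ids_to_tokens(int(prompt_ids[0]))
'<|startofprev|>'
>>> # prompt_ids can then be passed to WhisperForConditionalGeneration.generate(..., prompt_ids=prompt_ids)
```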

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.get_special_tokens_mask(token_ids_0, token_ids_1=None, already_has_special_tokens=False)

Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer prepare_for_model method.

PARAMETER DESCRIPTION
token_ids_0

List of IDs.

TYPE: `List[int]`

token_ids_1

Optional second list of IDs for sequence pairs.

TYPE: `List[int]`, *optional* DEFAULT: None

already_has_special_tokens

Whether or not the token list is already formatted with special tokens for the model.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

RETURNS DESCRIPTION
List[int]

List[int]: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 512-540
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` method.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """

    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    prefix_ones = [1] * len(self.prefix_tokens)
    suffix_ones = [1]
    if token_ids_1 is None:
        return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
    return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
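
As a sketch, the mask simply flags the prefix tokens that would be prepended and the eos token that would be appended; the exact length depends on how the input text tokenizes.

```python
>>> ids = tokenizer("hi", add_special_tokens=False)["input_ids"]
>>> # 1s cover the prefix tokens and the trailing eos, 0s cover the text tokens,
>>> # e.g. [1, 1, 1, 1, 0, 1] when "hi" maps to a single token
>>> tokenizer.get_special_tokens_mask(ids)
```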

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.normalize(text)

Normalize a given string using the EnglishTextNormalizer class, which performs common transformations on English text.

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 397-403
def normalize(self, text):
    """
    Normalize a given string using the `EnglishTextNormalizer` class, which performs common transformations on
    English text.
    """
    normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
    return normalizer(text)
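
A usage sketch; the `EnglishTextNormalizer` lower-cases, expands common abbreviations (e.g. "Mr." to "mister"), standardises number and spelling variants, and strips most punctuation, so the output below is indicative rather than exact.

```python
>>> tokenizer.normalize("Mr. Quilter is the apostle of the middle classes!")
'mister quilter is the apostle of the middle classes'
```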

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.set_prefix_tokens(language=None, task=None, predict_timestamps=None)

Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to update the prefix tokens as required when fine-tuning. Example:

>>> # instantiate the tokenizer and set the prefix token to Spanish
>>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
>>> # now switch the prefix token from Spanish to French
>>> tokenizer.set_prefix_tokens(language="french")
PARAMETER DESCRIPTION
language

The language of the transcription text.

TYPE: `str`, *optional*, defaults to `None` DEFAULT: None

task

Task identifier to append at the start of sequence (if any).

TYPE: `str`, *optional*, defaults to `None` DEFAULT: None

predict_timestamps

Whether to omit the <|notimestamps|> token at the start of the sequence.

TYPE: `bool`, *optional*, defaults to `None` DEFAULT: None

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 430-466
def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
    """
    Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
    update the prefix tokens as required when fine-tuning. Example:

    ```python
    >>> # instantiate the tokenizer and set the prefix token to Spanish
    >>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
    >>> # now switch the prefix token from Spanish to French
    >>> tokenizer.set_prefix_tokens(language="french")
    ```

    Args:
        language (`str`, *optional*, defaults to `None`):
            The language of the transcription text.
        task (`str`, *optional*, defaults to `None`):
            Task identifier to append at the start of sequence (if any).
        predict_timestamps (`bool`, *optional*, defaults to `None`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
    """
    self.language = language if language is not None else self.language
    self.task = task if task is not None else self.task
    self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps

    prefix_token_ids = self.prefix_tokens
    prefixes = self.convert_ids_to_tokens(prefix_token_ids)
    eos = self.eos_token
    eos_token_id = self.eos_token_id
    prefix_template = " ".join([f"{token}:0" for token in prefixes])
    self.backend_tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{prefix_template} $A:0 {eos}:0",
        pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
        special_tokens=[
            (eos, eos_token_id),
            *zip(prefixes, prefix_token_ids),
        ],
    )
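
Because this method rebuilds the backend post-processor, label encodings created afterwards pick up the new prefix, which is the typical fine-tuning workflow. A sketch continuing the example above (token names assume the standard multilingual vocabulary):

```python
>>> tokenizer.set_prefix_tokens(language="french", task="transcribe", predict_timestamps=False)
>>> labels = tokenizer("bonjour le monde")["input_ids"]
>>> tokenizer.convert_ids_to_tokens(labels)[:4]
['<|startoftranscript|>', '<|fr|>', '<|transcribe|>', '<|notimestamps|>']
```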

mindnlp.transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast.timestamp_ids(time_precision=0.02) cached

Compute the timestamp token ids for a given precision and save them to a least-recently used (LRU) cache.

PARAMETER DESCRIPTION
time_precision

The time ratio to convert from token to time.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

Source code in mindnlp\transformers\models\whisper\tokenization_whisper_fast.py, lines 254-264
@lru_cache
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.timestamp_ids
def timestamp_ids(self, time_precision=0.02):
    """
    Compute the timestamp token ids for a given precision and save them to a least-recently used (LRU) cache.

    Args:
        time_precision (`float`, *optional*, defaults to 0.02):
            The time ratio to convert from token to time.
    """
    return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
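
A quick check of the returned ids (a sketch, with `tokenizer` loaded as earlier): with the default `time_precision=0.02` the tokens run from `<|0.00|>` to `<|30.00|>` in 1501 steps.

```python
>>> ts_ids = tokenizer.timestamp_ids(time_precision=0.02)
>>> len(ts_ids)
1501
>>> tokenizer.convert_ids_to_tokens([ts_ids[0], ts_ids[-1]])
['<|0.00|>', '<|30.00|>']
```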