megatron_bert

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert` ¶

MindSpore MegatronBERT model.

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertEmbeddings` ¶

Bases: Module

Construct the embeddings from word, position and token_type embeddings.

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file

        # In Megatron, layer-norm is applied after the 1st dropout.
        # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", ops.arange(config.max_position_embeddings).broadcast_to((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        past_key_values_length: int = 0,
    ) -> mindspore.Tensor:
        if input_ids is not None:
            input_shape = input_ids.shape
        else:
            input_shape = inputs_embeds.shape[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if token_type_ids is None:
            token_type_ids = ops.zeros(input_shape, dtype=mindspore.int64)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # Megatron BERT moves that layer norm after the drop-out (and to each layer).
        # embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForCausalLM` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `MegatronBertForCausalLM` as a standalone, add `is_decoder=True.`")

        self.bert = MegatronBertModel(config, add_pooling_layer=False)
        self.cls = MegatronBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states  (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        past_key_values (`tuple(tuple(mindspore.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MegatronBertForCausalLM, MegatronBertConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
        >>> model = MegatronBertForCausalLM.from_pretrained("nvidia/megatron-bert-cased-345m", is_decoder=True)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :]
            labels = labels[:, 1:]
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),
            )
        return reordered_past

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForCausalLM.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

encoder_hidden_states (mindspore.Tensor of shape (batch_size, sequence_length, hidden_size), optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (mindspore.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in [0, 1]:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in [-100, 0, ..., config.vocab_size] (see input_ids docstring) Tokens with indices set to -100 are ignored (masked), the loss is only computed for the tokens with labels n [0, ..., config.vocab_size] past_key_values (tuple(tuple(mindspore.Tensor)) of length config.n_layers with each tuple having 4 tensors of shape (batch_size, num_heads, sequence_length - 1, embed_size_per_head)): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.

use_cache (bool, optional): If set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).

Returns:

Example:

>>> from transformers import AutoTokenizer, MegatronBertForCausalLM, MegatronBertConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
>>> model = MegatronBertForCausalLM.from_pretrained("nvidia/megatron-bert-cased-345m", is_decoder=True)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
    r"""
    encoder_hidden_states  (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
        `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
        ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
    past_key_values (`tuple(tuple(mindspore.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
        Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
        don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
        `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
        `past_key_values`).

    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, MegatronBertForCausalLM, MegatronBertConfig
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
    >>> model = MegatronBertForCausalLM.from_pretrained("nvidia/megatron-bert-cased-345m", is_decoder=True)

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.logits
    ```"""

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    if labels is not None:
        use_cache = False

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]
    prediction_scores = self.cls(sequence_output)

    lm_loss = None
    if labels is not None:
        # we are doing next-token prediction; shift prediction scores and input ids by one
        shifted_prediction_scores = prediction_scores[:, :-1, :]
        labels = labels[:, 1:]
        loss_fct = CrossEntropyLoss()
        lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

    if not return_dict:
        output = (prediction_scores,) + outputs[2:]
        return ((lm_loss,) + output) if lm_loss is not None else output

    return CausalLMOutputWithCrossAttentions(
        loss=lm_loss,
        logits=prediction_scores,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        cross_attentions=outputs.cross_attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMaskedLM` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `MegatronBertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = MegatronBertModel(config, add_pooling_layer=False)
        self.cls = MegatronBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")
        attention_mask = ops.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = ops.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=mindspore.int64
        )
        input_ids = ops.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMaskedLM.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the masked language modeling loss. Indices should be in [-100, 0, ..., config.vocab_size] (see input_ids docstring) Tokens with indices set to -100 are ignored (masked), the loss is only computed for the tokens with labels in [0, ..., config.vocab_size]

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
        config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
        loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
    """

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]
    prediction_scores = self.cls(sequence_output)

    masked_lm_loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()  # -100 index = padding token
        masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

    if not return_dict:
        output = (prediction_scores,) + outputs[2:]
        return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

    return MaskedLMOutput(
        loss=masked_lm_loss,
        logits=prediction_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMultipleChoice` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = MegatronBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.shape[-1]) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.shape[-1]) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.shape[-1]) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMultipleChoice.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

labels (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the multiple choice classification loss. Indices should be in [0, ..., num_choices-1] where num_choices is the size of the second dimension of the input tensors. (See input_ids above)

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
        num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
        `input_ids` above)
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

    input_ids = input_ids.view(-1, input_ids.shape[-1]) if input_ids is not None else None
    attention_mask = attention_mask.view(-1, attention_mask.shape[-1]) if attention_mask is not None else None
    token_type_ids = token_type_ids.view(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
    position_ids = position_ids.view(-1, position_ids.shape[-1]) if position_ids is not None else None
    inputs_embeds = (
        inputs_embeds.view(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])
        if inputs_embeds is not None
        else None
    )

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    reshaped_logits = logits.view(-1, num_choices)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)

    if not return_dict:
        output = (reshaped_logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForNextSentencePrediction` ¶

Bases: MegatronBertPreTrainedModel

Represents a MegatronBert model for next sentence prediction.

This class inherits from the MegatronBertPreTrainedModel and provides next sentence prediction functionality using the Megatron BERT model.

The class forwardor initializes the MegatronBertForNextSentencePrediction model with the given configuration.

The forward method takes input tensors and computes the next sentence prediction loss. It returns the next sentence predictor output.

PARAMETER	DESCRIPTION
`input_ids`	The input tensor containing the indices of input sequence tokens in the vocabulary. Defaults to None. TYPE: `Optional[Tensor]`
`attention_mask`	The input tensor containing indices specifying which tokens should be attended to. Defaults to None. TYPE: `Optional[Tensor]`
`token_type_ids`	The input tensor containing the segment token indices to differentiate the sequences. Defaults to None. TYPE: `Optional[Tensor]`
`position_ids`	The input tensor containing the position indices of each input token. Defaults to None. TYPE: `Optional[Tensor]`
`head_mask`	The input tensor containing the mask for the attention heads. Defaults to None. TYPE: `Optional[Tensor]`
`inputs_embeds`	The input tensor containing the embedded inputs. Defaults to None. TYPE: `Optional[Tensor]`
`labels`	The tensor containing the labels for computing the next sequence prediction loss. Defaults to None. TYPE: `Optional[Tensor]`
`output_attentions`	Whether to return attentions. Defaults to None. TYPE: `Optional[bool]`
`output_hidden_states`	Whether to return hidden states. Defaults to None. TYPE: `Optional[bool]`
`return_dict`	Whether to return a dictionary. Defaults to None. TYPE: `Optional[bool]`

RETURNS	DESCRIPTION
	Union[Tuple, NextSentencePredictorOutput]: A tuple containing the next sentence prediction loss and the next sentence predictor output.

RAISES	DESCRIPTION
`FutureWarning`	If the `next_sentence_label` argument is used, as it is deprecated.

Example

>>> from transformers import AutoTokenizer, MegatronBertForNextSentencePrediction
...
>>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
>>> model = MegatronBertForNextSentencePrediction.from_pretrained("nvidia/megatron-bert-cased-345m")
...
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="ms")
...
>>> outputs = model(**encoding, labels=mindspore.Tensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1]  # next sentence was random

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
    """
    Represents a MegatronBert model for next sentence prediction.

    This class inherits from the MegatronBertPreTrainedModel and provides next sentence prediction functionality
    using the Megatron BERT model.

    The class forwardor initializes the MegatronBertForNextSentencePrediction model with the given configuration.

    The `forward` method takes input tensors and computes the next sentence prediction loss.
    It returns the next sentence predictor output.

    Args:
        input_ids (Optional[mindspore.Tensor], optional): The input tensor containing the indices of input sequence
            tokens in the vocabulary. Defaults to None.
        attention_mask (Optional[mindspore.Tensor], optional): The input tensor containing indices specifying which
            tokens should be attended to. Defaults to None.
        token_type_ids (Optional[mindspore.Tensor], optional): The input tensor containing the segment token indices
            to differentiate the sequences. Defaults to None.
        position_ids (Optional[mindspore.Tensor], optional): The input tensor containing the position indices of
            each input token. Defaults to None.
        head_mask (Optional[mindspore.Tensor], optional): The input tensor containing the mask for the attention heads.
            Defaults to None.
        inputs_embeds (Optional[mindspore.Tensor], optional): The input tensor containing the embedded inputs.
            Defaults to None.
        labels (Optional[mindspore.Tensor], optional): The tensor containing the labels for computing the next sequence
            prediction loss. Defaults to None.
        output_attentions (Optional[bool], optional): Whether to return attentions. Defaults to None.
        output_hidden_states (Optional[bool], optional): Whether to return hidden states. Defaults to None.
        return_dict (Optional[bool], optional): Whether to return a dictionary. Defaults to None.

    Returns:
        Union[Tuple, NextSentencePredictorOutput]: A tuple containing the next sentence prediction loss and the
            next sentence predictor output.

    Raises:
        FutureWarning: If the `next_sentence_label` argument is used, as it is deprecated.

    Example:
        ```python
        >>> from transformers import AutoTokenizer, MegatronBertForNextSentencePrediction
        ...
        >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
        >>> model = MegatronBertForNextSentencePrediction.from_pretrained("nvidia/megatron-bert-cased-345m")
        ...
        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="ms")
        ...
        >>> outputs = model(**encoding, labels=mindspore.Tensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
    """
    def __init__(self, config):
        super().__init__(config)

        self.bert = MegatronBertModel(config)
        self.cls = MegatronBertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, NextSentencePredictorOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:

        Example:
        ```python
        >>> from transformers import AutoTokenizer, MegatronBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
        >>> model = MegatronBertForNextSentencePrediction.from_pretrained("nvidia/megatron-bert-cased-345m")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=mindspore.Tensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```"""

        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForNextSentencePrediction.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, **kwargs)` ¶

labels (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see input_ids docstring). Indices should be in [0, 1]:

- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.

Returns:

Example:

>>> from transformers import AutoTokenizer, MegatronBertForNextSentencePrediction
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
>>> model = MegatronBertForNextSentencePrediction.from_pretrained("nvidia/megatron-bert-cased-345m")

>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

>>> outputs = model(**encoding, labels=mindspore.Tensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1]  # next sentence was random

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    **kwargs,
) -> Union[Tuple, NextSentencePredictorOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
        (see `input_ids` docstring). Indices should be in `[0, 1]`:

        - 0 indicates sequence B is a continuation of sequence A,
        - 1 indicates sequence B is a random sequence.

    Returns:

    Example:
    ```python
    >>> from transformers import AutoTokenizer, MegatronBertForNextSentencePrediction
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
    >>> model = MegatronBertForNextSentencePrediction.from_pretrained("nvidia/megatron-bert-cased-345m")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

    >>> outputs = model(**encoding, labels=mindspore.Tensor([1]))
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```"""

    if "next_sentence_label" in kwargs:
        warnings.warn(
            "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
            " `labels` instead.",
            FutureWarning,
        )
        labels = kwargs.pop("next_sentence_label")

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    pooled_output = outputs[1]

    seq_relationship_scores = self.cls(pooled_output)

    next_sentence_loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

    if not return_dict:
        output = (seq_relationship_scores,) + outputs[2:]
        return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

    return NextSentencePredictorOutput(
        loss=next_sentence_loss,
        logits=seq_relationship_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTraining` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder"]

    def __init__(self, config, add_binary_head=True):
        super().__init__(config)

        self.bert = MegatronBertModel(config)
        self.cls = MegatronBertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        next_sentence_label: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MegatronBertForPreTrainingOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MegatronBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
        >>> model = MegatronBertForPreTraining.from_pretrained("nvidia/megatron-bert-cased-345m")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```"""

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MegatronBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTraining.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the masked language modeling loss. Indices should be in [-100, 0, ..., config.vocab_size] (see input_ids docstring) Tokens with indices set to -100 are ignored (masked), the loss is only computed for the tokens with labels in [0, ..., config.vocab_size] next_sentence_label (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see input_ids docstring) Indices should be in [0, 1]:

- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.

kwargs (Dict[str, any], optional, defaults to {}): Used to hide legacy arguments that have been deprecated.

Returns:

Example:

>>> from transformers import AutoTokenizer, MegatronBertForPreTraining
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
>>> model = MegatronBertForPreTraining.from_pretrained("nvidia/megatron-bert-cased-345m")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    next_sentence_label: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, MegatronBertForPreTrainingOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
        config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
        loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
    next_sentence_label (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
        (see `input_ids` docstring) Indices should be in `[0, 1]`:

        - 0 indicates sequence B is a continuation of sequence A,
        - 1 indicates sequence B is a random sequence.
    kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
        Used to hide legacy arguments that have been deprecated.

    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, MegatronBertForPreTraining
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
    >>> model = MegatronBertForPreTraining.from_pretrained("nvidia/megatron-bert-cased-345m")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```"""

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output, pooled_output = outputs[:2]
    prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

    total_loss = None
    if labels is not None and next_sentence_label is not None:
        loss_fct = CrossEntropyLoss()
        masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
        next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
        total_loss = masked_lm_loss + next_sentence_loss

    if not return_dict:
        output = (prediction_scores, seq_relationship_score) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output

    return MegatronBertForPreTrainingOutput(
        loss=total_loss,
        prediction_logits=prediction_scores,
        seq_relationship_logits=seq_relationship_score,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTrainingOutput` `dataclass` ¶

Bases: ModelOutput

Output type of [MegatronBertForPreTraining].

PARAMETER	DESCRIPTION
`loss`	Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. TYPE: optional, returned when `labels` is provided, `mindspore.Tensor` of shape `(1,)` DEFAULT: `None`
`prediction_logits`	Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)` DEFAULT: `None`
`seq_relationship_logits`	Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). TYPE: `mindspore.Tensor` of shape `(batch_size, 2)` DEFAULT: `None`
`hidden_states`	Tuple of `mindspore.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. TYPE: `tuple(mindspore.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` DEFAULT: `None`
`attentions`	Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. TYPE: `tuple(mindspore.Tensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True` DEFAULT: `None`

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

@dataclass
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->MegatronBert
class MegatronBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`MegatronBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `mindspore.Tensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`mindspore.Tensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[mindspore.Tensor] = None
    prediction_logits: mindspore.Tensor = None
    seq_relationship_logits: mindspore.Tensor = None
    hidden_states: Optional[Tuple[mindspore.Tensor]] = None
    attentions: Optional[Tuple[mindspore.Tensor]] = None

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForQuestionAnswering` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = MegatronBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        start_positions: Optional[mindspore.Tensor] = None,
        end_positions: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = ops.split(logits, 1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.shape) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.shape) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.shape[1]
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForQuestionAnswering.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

start_positions (mindspore.Tensor of shape (batch_size,), optional): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence are not taken into account for computing the loss. end_positions (mindspore.Tensor of shape (batch_size,), optional): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence are not taken into account for computing the loss.

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    start_positions: Optional[mindspore.Tensor] = None,
    end_positions: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
    r"""
    start_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the start of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    end_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the end of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = ops.split(logits, 1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    total_loss = None
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, split add a dimension
        if len(start_positions.shape) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.shape) > 1:
            end_positions = end_positions.squeeze(-1)
        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.shape[1]
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2

    if not return_dict:
        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output

    return QuestionAnsweringModelOutput(
        loss=total_loss,
        start_logits=start_logits,
        end_logits=end_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForSequenceClassification` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = MegatronBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForSequenceClassification.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

labels (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), If config.num_labels > 1 a classification loss is computed (Cross-Entropy).

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)

    loss = None
    if labels is not None:
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            loss_fct = MSELoss()
            if self.num_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForTokenClassification` ¶

Bases: MegatronBertPreTrainedModel

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = MegatronBertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForTokenClassification.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the token classification loss. Indices should be in [0, ..., config.num_labels - 1].

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]

    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertModel` ¶

Bases: MegatronBertPreTrainedModel

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in Attention is all you need by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

To behave as an decoder the model needs to be initialized with the is_decoder argument of the configuration set to True. To be used in a Seq2Seq model, the model needs to initialized with both is_decoder argument and add_cross_attention set to True; an encoder_hidden_states is then expected as an input to the forward pass.

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertModel(MegatronBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = MegatronBertEmbeddings(config)
        self.encoder = MegatronBertEncoder(config)

        self.pooler = MegatronBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states  (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(mindspore.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = ops.ones(((batch_size, seq_length + past_key_values_length)))
        if token_type_ids is None:
            token_type_ids = ops.zeros(input_shape, dtype=mindspore.int64)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: mindspore.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = ops.ones(encoder_hidden_shape)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertModel.forward(input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)` ¶

encoder_hidden_states (mindspore.Tensor of shape (batch_size, sequence_length, hidden_size), optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (mindspore.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in [0, 1]:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.

past_key_values (tuple(tuple(mindspore.Tensor)) of length config.n_layers with each tuple having 4 tensors of shape (batch_size, num_heads, sequence_length - 1, embed_size_per_head)): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.

use_cache (bool, optional): If set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
    r"""
    encoder_hidden_states  (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
    past_key_values (`tuple(tuple(mindspore.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
        Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
        don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
        `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
        `past_key_values`).
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if self.config.is_decoder:
        use_cache = use_cache if use_cache is not None else self.config.use_cache
    else:
        use_cache = False

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
        input_shape = input_ids.shape
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.shape[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    batch_size, seq_length = input_shape

    # past_key_values_length
    past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

    if attention_mask is None:
        attention_mask = ops.ones(((batch_size, seq_length + past_key_values_length)))
    if token_type_ids is None:
        token_type_ids = ops.zeros(input_shape, dtype=mindspore.int64)

    # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    # ourselves in which case we just need to make it broadcastable to all heads.
    extended_attention_mask: mindspore.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = ops.ones(encoder_hidden_shape)
        encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    embedding_output = self.embeddings(
        input_ids=input_ids,
        position_ids=position_ids,
        token_type_ids=token_type_ids,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
    )
    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        past_key_values=encoder_outputs.past_key_values,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertPreTrainedModel` ¶

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\megatron_bert\modeling_megatron_bert.py

class MegatronBertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = MegatronBertConfig
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            nn.init.zeros_(module.bias)
            nn.init.ones_(module.weight)
        if isinstance(module, nn.Linear) and module.bias is not None:
            nn.init.zeros_(module.bias)

`mindnlp.transformers.models.megatron_bert.configuration_megatron_bert` ¶

MEGATRON_BERT model configuration

`mindnlp.transformers.models.megatron_bert.configuration_megatron_bert.MegatronBertConfig` ¶

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [MegatronBertModel]. It is used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT nvidia/megatron-bert-uncased-345m architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER	DESCRIPTION
`vocab_size`	Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`MegatronBertModel`]. TYPE: `int`, optional, defaults to 29056 DEFAULT: `29056`
`hidden_size`	Dimensionality of the encoder layers and the pooler layer. TYPE: `int`, optional, defaults to 1024 DEFAULT: `1024`
`num_hidden_layers`	Number of hidden layers in the Transformer encoder. TYPE: `int`, optional, defaults to 24 DEFAULT: `24`
`num_attention_heads`	Number of attention heads for each attention layer in the Transformer encoder. TYPE: `int`, optional, defaults to 16 DEFAULT: `16`
`intermediate_size`	Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. TYPE: `int`, optional, defaults to 4096 DEFAULT: `4096`
`hidden_act`	The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. TYPE: `str` or `Callable`, optional, defaults to `"gelu"` DEFAULT: `'gelu'`
`hidden_dropout_prob`	The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. TYPE: `float`, optional, defaults to 0.1 DEFAULT: `0.1`
`attention_probs_dropout_prob`	The dropout ratio for the attention probabilities. TYPE: `float`, optional, defaults to 0.1 DEFAULT: `0.1`
`max_position_embeddings`	The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). TYPE: `int`, optional, defaults to 512 DEFAULT: `512`
`type_vocab_size`	The vocabulary size of the `token_type_ids` passed when calling [`MegatronBertModel`]. TYPE: `int`, optional, defaults to 2 DEFAULT: `2`
`initializer_range`	The standard deviation of the truncated_normal_initializer for initializing all weight matrices. TYPE: `float`, optional, defaults to 0.02 DEFAULT: `0.02`
`layer_norm_eps`	The epsilon used by the layer normalization layers. TYPE: `float`, optional, defaults to 1e-12 DEFAULT: `1e-12`
`position_embedding_type`	Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to Self-Attention with Relative Position Representations (Shaw et al.). For more information on `"relative_key_query"`, please refer to Method 4 in Improve Transformer Models with Better Relative Position Embeddings (Huang et al.). TYPE: `str`, optional, defaults to `"absolute"` DEFAULT: `'absolute'`
`is_decoder`	Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. TYPE: `bool`, optional, defaults to `False`
`use_cache`	Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`

Example

>>> from transformers import MegatronBertConfig, MegatronBertModel
...
>>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
>>> configuration = MegatronBertConfig()
...
>>> # Initializing a model (with random weights) from the bert-base-uncased style configuration
>>> model = MegatronBertModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config

Source code in mindnlp\transformers\models\megatron_bert\configuration_megatron_bert.py

class MegatronBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MegatronBertModel`]. It is used to instantiate a
    MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT
    [nvidia/megatron-bert-uncased-345m](https://hf-mirror.com/nvidia/megatron-bert-uncased-345m) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 29056):
            Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`MegatronBertModel`].
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`MegatronBertModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
        is_decoder (`bool`, *optional*, defaults to `False`):
            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.

    Example:
        ```python
        >>> from transformers import MegatronBertConfig, MegatronBertModel
        ...
        >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
        >>> configuration = MegatronBertConfig()
        ...
        >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration
        >>> model = MegatronBertModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "megatron-bert"

    def __init__(
        self,
        vocab_size=29056,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs,
    ):
        """
        Initialize a MegatronBertConfig object with the provided parameters.

        Args:
            vocab_size (int): The size of the vocabulary used for tokenization.
            hidden_size (int): The size of the hidden layers in the model.
            num_hidden_layers (int): The number of hidden layers in the model.
            num_attention_heads (int): The number of attention heads in the model.
            intermediate_size (int): The size of the intermediate (feed-forward) layer.
            hidden_act (str): The activation function used in the hidden layers.
            hidden_dropout_prob (float): The dropout probability for the hidden layers.
            attention_probs_dropout_prob (float): The dropout probability for attention probabilities.
            max_position_embeddings (int): The maximum length of input sequences.
            type_vocab_size (int): The size of the token type embeddings.
            initializer_range (float): The range for parameter initializations.
            layer_norm_eps (float): The epsilon value for layer normalization.
            pad_token_id (int): The ID of the padding token.
            position_embedding_type (str): The type of position embeddings used.
            use_cache (bool): Whether to use caching during inference.

        Returns:
            None.

        Raises:
            ValueError: If any argument is invalid or out of range.
        """
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache

`mindnlp.transformers.models.megatron_bert.configuration_megatron_bert.MegatronBertConfig.init(vocab_size=29056, hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, intermediate_size=4096, hidden_act='gelu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type='absolute', use_cache=True, **kwargs)` ¶

Initialize a MegatronBertConfig object with the provided parameters.

PARAMETER	DESCRIPTION
`vocab_size`	The size of the vocabulary used for tokenization. TYPE: `int` DEFAULT: `29056`
`hidden_size`	The size of the hidden layers in the model. TYPE: `int` DEFAULT: `1024`
`num_hidden_layers`	The number of hidden layers in the model. TYPE: `int` DEFAULT: `24`
`num_attention_heads`	The number of attention heads in the model. TYPE: `int` DEFAULT: `16`
`intermediate_size`	The size of the intermediate (feed-forward) layer. TYPE: `int` DEFAULT: `4096`
`hidden_act`	The activation function used in the hidden layers. TYPE: `str` DEFAULT: `'gelu'`
`hidden_dropout_prob`	The dropout probability for the hidden layers. TYPE: `float` DEFAULT: `0.1`
`attention_probs_dropout_prob`	The dropout probability for attention probabilities. TYPE: `float` DEFAULT: `0.1`
`max_position_embeddings`	The maximum length of input sequences. TYPE: `int` DEFAULT: `512`
`type_vocab_size`	The size of the token type embeddings. TYPE: `int` DEFAULT: `2`
`initializer_range`	The range for parameter initializations. TYPE: `float` DEFAULT: `0.02`
`layer_norm_eps`	The epsilon value for layer normalization. TYPE: `float` DEFAULT: `1e-12`
`pad_token_id`	The ID of the padding token. TYPE: `int` DEFAULT: `0`
`position_embedding_type`	The type of position embeddings used. TYPE: `str` DEFAULT: `'absolute'`
`use_cache`	Whether to use caching during inference. TYPE: `bool` DEFAULT: `True`

RETURNS	DESCRIPTION
	None.

RAISES	DESCRIPTION
`ValueError`	If any argument is invalid or out of range.

Source code in mindnlp\transformers\models\megatron_bert\configuration_megatron_bert.py

def __init__(
    self,
    vocab_size=29056,
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=4096,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    position_embedding_type="absolute",
    use_cache=True,
    **kwargs,
):
    """
    Initialize a MegatronBertConfig object with the provided parameters.

    Args:
        vocab_size (int): The size of the vocabulary used for tokenization.
        hidden_size (int): The size of the hidden layers in the model.
        num_hidden_layers (int): The number of hidden layers in the model.
        num_attention_heads (int): The number of attention heads in the model.
        intermediate_size (int): The size of the intermediate (feed-forward) layer.
        hidden_act (str): The activation function used in the hidden layers.
        hidden_dropout_prob (float): The dropout probability for the hidden layers.
        attention_probs_dropout_prob (float): The dropout probability for attention probabilities.
        max_position_embeddings (int): The maximum length of input sequences.
        type_vocab_size (int): The size of the token type embeddings.
        initializer_range (float): The range for parameter initializations.
        layer_norm_eps (float): The epsilon value for layer normalization.
        pad_token_id (int): The ID of the padding token.
        position_embedding_type (str): The type of position embeddings used.
        use_cache (bool): Whether to use caching during inference.

    Returns:
        None.

    Raises:
        ValueError: If any argument is invalid or out of range.
    """
    super().__init__(pad_token_id=pad_token_id, **kwargs)

    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range
    self.layer_norm_eps = layer_norm_eps
    self.position_embedding_type = position_embedding_type
    self.use_cache = use_cache

megatron_bert

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertEmbeddings ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForCausalLM ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMaskedLM ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMultipleChoice ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForNextSentencePrediction ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTraining ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTrainingOutput dataclass ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForQuestionAnswering ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForSequenceClassification ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForTokenClassification ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertModel ¶

mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertPreTrainedModel ¶

mindnlp.transformers.models.megatron_bert.configuration_megatron_bert ¶

mindnlp.transformers.models.megatron_bert.configuration_megatron_bert.MegatronBertConfig ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertEmbeddings` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForCausalLM` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMaskedLM` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForMultipleChoice` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForNextSentencePrediction` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTraining` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForPreTrainingOutput` `dataclass` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForQuestionAnswering` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForSequenceClassification` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertForTokenClassification` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertModel` ¶

`mindnlp.transformers.models.megatron_bert.modeling_megatron_bert.MegatronBertPreTrainedModel` ¶

`mindnlp.transformers.models.megatron_bert.configuration_megatron_bert` ¶

`mindnlp.transformers.models.megatron_bert.configuration_megatron_bert.MegatronBertConfig` ¶