跳转至

mpnet

mindnlp.transformers.models.mpnet.configuration_mpnet

MPNet model configuration

mindnlp.transformers.models.mpnet.configuration_mpnet.MPNetConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [MPNetModel] or a [TFMPNetModel]. It is used to instantiate a MPNet model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the MPNet microsoft/mpnet-base architecture. ```

Source code in mindnlp\transformers\models\mpnet\configuration_mpnet.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class MPNetConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MPNetModel`] or a [`TFMPNetModel`]. It is used to
    instantiate a MPNet model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MPNet
    [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base) architecture.
    ```"""
    model_type = "mpnet"

    def __init__(
        self,
        vocab_size=30527,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        relative_attention_num_buckets=32,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        """Initializes a new instance of the MPNetConfig class.

        Args:
            vocab_size (int, optional): The size of the vocabulary. Defaults to 30527.
            hidden_size (int, optional): The size of the hidden states. Defaults to 768.
            num_hidden_layers (int, optional): The number of hidden layers. Defaults to 12.
            num_attention_heads (int, optional): The number of attention heads. Defaults to 12.
            intermediate_size (int, optional): The size of the intermediate layer in the feedforward network. Defaults to 3072.
            hidden_act (str, optional): The activation function for the hidden layers. Defaults to 'gelu'.
            hidden_dropout_prob (float, optional): The dropout probability for the hidden layers. Defaults to 0.1.
            attention_probs_dropout_prob (float, optional): The dropout probability for the attention probabilities. Defaults to 0.1.
            max_position_embeddings (int, optional): The maximum number of positional embeddings. Defaults to 512.
            initializer_range (float, optional): The range for the random weight initialization. Defaults to 0.02.
            layer_norm_eps (float, optional): The epsilon value for layer normalization. Defaults to 1e-12.
            relative_attention_num_buckets (int, optional): The number of buckets for relative attention. Defaults to 32.
            pad_token_id (int, optional): The token ID for padding. Defaults to 1.
            bos_token_id (int, optional): The token ID for the beginning of sequence. Defaults to 0.
            eos_token_id (int, optional): The token ID for the end of sequence. Defaults to 2.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.relative_attention_num_buckets = relative_attention_num_buckets

mindnlp.transformers.models.mpnet.configuration_mpnet.MPNetConfig.__init__(vocab_size=30527, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act='gelu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, initializer_range=0.02, layer_norm_eps=1e-12, relative_attention_num_buckets=32, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs)

Initializes a new instance of the MPNetConfig class.

PARAMETER DESCRIPTION
vocab_size

The size of the vocabulary. Defaults to 30527.

TYPE: int DEFAULT: 30527

hidden_size

The size of the hidden states. Defaults to 768.

TYPE: int DEFAULT: 768

num_hidden_layers

The number of hidden layers. Defaults to 12.

TYPE: int DEFAULT: 12

num_attention_heads

The number of attention heads. Defaults to 12.

TYPE: int DEFAULT: 12

intermediate_size

The size of the intermediate layer in the feedforward network. Defaults to 3072.

TYPE: int DEFAULT: 3072

hidden_act

The activation function for the hidden layers. Defaults to 'gelu'.

TYPE: str DEFAULT: 'gelu'

hidden_dropout_prob

The dropout probability for the hidden layers. Defaults to 0.1.

TYPE: float DEFAULT: 0.1

attention_probs_dropout_prob

The dropout probability for the attention probabilities. Defaults to 0.1.

TYPE: float DEFAULT: 0.1

max_position_embeddings

The maximum number of positional embeddings. Defaults to 512.

TYPE: int DEFAULT: 512

initializer_range

The range for the random weight initialization. Defaults to 0.02.

TYPE: float DEFAULT: 0.02

layer_norm_eps

The epsilon value for layer normalization. Defaults to 1e-12.

TYPE: float DEFAULT: 1e-12

relative_attention_num_buckets

The number of buckets for relative attention. Defaults to 32.

TYPE: int DEFAULT: 32

pad_token_id

The token ID for padding. Defaults to 1.

TYPE: int DEFAULT: 1

bos_token_id

The token ID for the beginning of sequence. Defaults to 0.

TYPE: int DEFAULT: 0

eos_token_id

The token ID for the end of sequence. Defaults to 2.

TYPE: int DEFAULT: 2

RETURNS DESCRIPTION

None.

Source code in mindnlp\transformers\models\mpnet\configuration_mpnet.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def __init__(
    self,
    vocab_size=30527,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    relative_attention_num_buckets=32,
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    **kwargs,
):
    """Initializes a new instance of the MPNetConfig class.

    Args:
        vocab_size (int, optional): The size of the vocabulary. Defaults to 30527.
        hidden_size (int, optional): The size of the hidden states. Defaults to 768.
        num_hidden_layers (int, optional): The number of hidden layers. Defaults to 12.
        num_attention_heads (int, optional): The number of attention heads. Defaults to 12.
        intermediate_size (int, optional): The size of the intermediate layer in the feedforward network. Defaults to 3072.
        hidden_act (str, optional): The activation function for the hidden layers. Defaults to 'gelu'.
        hidden_dropout_prob (float, optional): The dropout probability for the hidden layers. Defaults to 0.1.
        attention_probs_dropout_prob (float, optional): The dropout probability for the attention probabilities. Defaults to 0.1.
        max_position_embeddings (int, optional): The maximum number of positional embeddings. Defaults to 512.
        initializer_range (float, optional): The range for the random weight initialization. Defaults to 0.02.
        layer_norm_eps (float, optional): The epsilon value for layer normalization. Defaults to 1e-12.
        relative_attention_num_buckets (int, optional): The number of buckets for relative attention. Defaults to 32.
        pad_token_id (int, optional): The token ID for padding. Defaults to 1.
        bos_token_id (int, optional): The token ID for the beginning of sequence. Defaults to 0.
        eos_token_id (int, optional): The token ID for the end of sequence. Defaults to 2.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.initializer_range = initializer_range
    self.layer_norm_eps = layer_norm_eps
    self.relative_attention_num_buckets = relative_attention_num_buckets

mindnlp.transformers.models.mpnet.modeling_mpnet

MindSpore MPNet model.

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetClassificationHead

Bases: Module

Head for sentence-level classification tasks.

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
class MPNetClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to BERT's [CLS] token)
        x = self.dropout(x)
        x = self.dense(x)
        x = ops.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetEmbeddings

Bases: Module

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class MPNetEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.padding_idx = 1
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.register_buffer(
            "position_ids", ops.arange(config.max_position_embeddings).broadcast_to((1, -1)), persistent=False
        )

    def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs):
        if position_ids is None:
            if input_ids is not None:
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.shape
        else:
            input_shape = inputs_embeds.shape[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = inputs_embeds + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: mindspore.Tensor

        Returns: mindspore.Tensor
        """
        input_shape = inputs_embeds.shape[:-1]
        sequence_length = input_shape[1]

        position_ids = ops.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=mindspore.int64
        )
        return position_ids.unsqueeze(0).broadcast_to(input_shape)

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetEmbeddings.create_position_ids_from_inputs_embeds(inputs_embeds)

We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

PARAMETER DESCRIPTION
inputs_embeds

mindspore.Tensor

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
    """
    We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

    Args:
        inputs_embeds: mindspore.Tensor

    Returns: mindspore.Tensor
    """
    input_shape = inputs_embeds.shape[:-1]
    sequence_length = input_shape[1]

    position_ids = ops.arange(
        self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=mindspore.int64
    )
    return position_ids.unsqueeze(0).broadcast_to(input_shape)

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForMaskedLM

Bases: MPNetPreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
class MPNetForMaskedLM(MPNetPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder"]

    def __init__(self, config):
        super().__init__(config)

        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.lm_head = MPNetLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings
        self.lm_head.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], MaskedLMOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForMaskedLM.forward(input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the masked language modeling loss. Indices should be in [-100, 0, ..., config.vocab_size] (see input_ids docstring) Tokens with indices set to -100 are ignored (masked), the loss is only computed for the tokens with labels in [0, ..., config.vocab_size]

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], MaskedLMOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
        config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
        loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.mpnet(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]
    prediction_scores = self.lm_head(sequence_output)

    masked_lm_loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

    if not return_dict:
        output = (prediction_scores,) + outputs[2:]
        return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

    return MaskedLMOutput(
        loss=masked_lm_loss,
        logits=prediction_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForMultipleChoice

Bases: MPNetPreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
class MPNetForMultipleChoice(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.mpnet = MPNetModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.shape[-1]) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.shape[-1]) if position_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.shape[-1]) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])
            if inputs_embeds is not None
            else None
        )

        outputs = self.mpnet(
            flat_input_ids,
            position_ids=flat_position_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForMultipleChoice.forward(input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the multiple choice classification loss. Indices should be in [0, ..., num_choices-1] where num_choices is the size of the second dimension of the input tensors. (See input_ids above)

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], MultipleChoiceModelOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
        num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
        `input_ids` above)
    """

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

    flat_input_ids = input_ids.view(-1, input_ids.shape[-1]) if input_ids is not None else None
    flat_position_ids = position_ids.view(-1, position_ids.shape[-1]) if position_ids is not None else None
    flat_attention_mask = attention_mask.view(-1, attention_mask.shape[-1]) if attention_mask is not None else None
    flat_inputs_embeds = (
        inputs_embeds.view(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])
        if inputs_embeds is not None
        else None
    )

    outputs = self.mpnet(
        flat_input_ids,
        position_ids=flat_position_ids,
        attention_mask=flat_attention_mask,
        head_mask=head_mask,
        inputs_embeds=flat_inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    reshaped_logits = logits.view(-1, num_choices)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)

    if not return_dict:
        output = (reshaped_logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForQuestionAnswering

Bases: MPNetPreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
class MPNetForQuestionAnswering(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        start_positions: Optional[mindspore.Tensor] = None,
        end_positions: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = ops.split(logits, 1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.shape) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.shape) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.shape[1]
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForQuestionAnswering.forward(input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None)

start_positions (mindspore.Tensor of shape (batch_size,), optional): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence are not taken into account for computing the loss. end_positions (mindspore.Tensor of shape (batch_size,), optional): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence are not taken into account for computing the loss.

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    start_positions: Optional[mindspore.Tensor] = None,
    end_positions: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], QuestionAnsweringModelOutput]:
    r"""
    start_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the start of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    end_positions (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the end of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    """

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.mpnet(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = ops.split(logits, 1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    total_loss = None
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, split add a dimension
        if len(start_positions.shape) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.shape) > 1:
            end_positions = end_positions.squeeze(-1)
        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.shape[1]
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2

    if not return_dict:
        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output

    return QuestionAnsweringModelOutput(
        loss=total_loss,
        start_logits=start_logits,
        end_logits=end_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForSequenceClassification

Bases: MPNetPreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
class MPNetForSequenceClassification(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.classifier = MPNetClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForSequenceClassification.forward(input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size,), optional): Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), If config.num_labels > 1 a classification loss is computed (Cross-Entropy).

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], SequenceClassifierOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.mpnet(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = outputs[0]
    logits = self.classifier(sequence_output)

    loss = None
    if labels is not None:
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            loss_fct = MSELoss()
            if self.num_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForTokenClassification

Bases: MPNetPreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
class MPNetForTokenClassification(MPNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], TokenClassifierOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetForTokenClassification.forward(input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for computing the token classification loss. Indices should be in [0, ..., config.num_labels - 1].

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor], TokenClassifierOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
    """

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.mpnet(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]

    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetLMHead

Bases: Module

MPNet Head for masked and permuted language modeling.

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
class MPNetLMHead(nn.Module):
    """MPNet Head for masked and permuted language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(ops.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetModel

Bases: MPNetPreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
class MPNetModel(MPNetPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = MPNetEmbeddings(config)
        self.encoder = MPNetEncoder(config)
        self.pooler = MPNetPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[mindspore.Tensor], BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")


        if attention_mask is None:
            attention_mask = ops.ones(input_shape)
        extended_attention_mask: mindspore.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

mindnlp.transformers.models.mpnet.modeling_mpnet.MPNetPreTrainedModel

Bases: PreTrainedModel

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
class MPNetPreTrainedModel(PreTrainedModel):
    config_class = MPNetConfig
    base_model_prefix = "mpnet"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight[module.padding_idx] = 0
        elif isinstance(module, nn.LayerNorm):
            nn.init.zeros_(module.bias)
            nn.init.ones_(module.weight)

mindnlp.transformers.models.mpnet.modeling_mpnet.create_position_ids_from_input_ids(input_ids, padding_idx)

Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's utils.make_positions. :param mindspore.Tensor x: :return mindspore.Tensor:

Source code in mindnlp\transformers\models\mpnet\modeling_mpnet.py
915
916
917
918
919
920
921
922
923
def create_position_ids_from_input_ids(input_ids, padding_idx):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`. :param mindspore.Tensor x: :return mindspore.Tensor:
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = ops.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx

mindnlp.transformers.models.mpnet.tokenization_mpnet

Tokenization classes for MPNet.

mindnlp.transformers.models.mpnet.tokenization_mpnet.BasicTokenizer

Bases: object

Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

PARAMETER DESCRIPTION
do_lower_case

Whether or not to lowercase the input when tokenizing.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

never_split

Collection of tokens which will never be split during tokenization. Only has an effect when do_basic_tokenize=True

TYPE: `Iterable`, *optional* DEFAULT: None

tokenize_chinese_chars

Whether or not to tokenize Chinese characters.

This should likely be deactivated for Japanese (see this issue).

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

strip_accents

Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for lowercase (as in the original BERT).

TYPE: `bool`, *optional* DEFAULT: None

do_split_on_punc

In some instances we want to skip the basic punctuation splitting so that later tokenization can capture the full context of the words, such as contractions.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """
    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        """
        Initializes a BasicTokenizer object with the specified parameters.

        Args:
            self: The instance of the BasicTokenizer class.
            do_lower_case (bool): A flag indicating whether text should be converted to lowercase. Default is True.
            never_split (list): A list of tokens that should never be split during tokenization. Default is an empty list.
            tokenize_chinese_chars (bool): A flag indicating whether to tokenize Chinese characters. Default is True.
            strip_accents (None): Not used in the current implementation.
            do_split_on_punc (bool): A flag indicating whether to split on punctuation marks. Default is True.

        Returns:
            None.

        Raises:
            None.
        """
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

mindnlp.transformers.models.mpnet.tokenization_mpnet.BasicTokenizer.__init__(do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None, do_split_on_punc=True)

Initializes a BasicTokenizer object with the specified parameters.

PARAMETER DESCRIPTION
self

The instance of the BasicTokenizer class.

do_lower_case

A flag indicating whether text should be converted to lowercase. Default is True.

TYPE: bool DEFAULT: True

never_split

A list of tokens that should never be split during tokenization. Default is an empty list.

TYPE: list DEFAULT: None

tokenize_chinese_chars

A flag indicating whether to tokenize Chinese characters. Default is True.

TYPE: bool DEFAULT: True

strip_accents

Not used in the current implementation.

TYPE: None DEFAULT: None

do_split_on_punc

A flag indicating whether to split on punctuation marks. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION

None.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
def __init__(
    self,
    do_lower_case=True,
    never_split=None,
    tokenize_chinese_chars=True,
    strip_accents=None,
    do_split_on_punc=True,
):
    """
    Initializes a BasicTokenizer object with the specified parameters.

    Args:
        self: The instance of the BasicTokenizer class.
        do_lower_case (bool): A flag indicating whether text should be converted to lowercase. Default is True.
        never_split (list): A list of tokens that should never be split during tokenization. Default is an empty list.
        tokenize_chinese_chars (bool): A flag indicating whether to tokenize Chinese characters. Default is True.
        strip_accents (None): Not used in the current implementation.
        do_split_on_punc (bool): A flag indicating whether to split on punctuation marks. Default is True.

    Returns:
        None.

    Raises:
        None.
    """
    if never_split is None:
        never_split = []
    self.do_lower_case = do_lower_case
    self.never_split = set(never_split)
    self.tokenize_chinese_chars = tokenize_chinese_chars
    self.strip_accents = strip_accents
    self.do_split_on_punc = do_split_on_punc

mindnlp.transformers.models.mpnet.tokenization_mpnet.BasicTokenizer.tokenize(text, never_split=None)

Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
def tokenize(self, text, never_split=None):
    """
    Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

    Args:
        never_split (`List[str]`, *optional*)
            Kept for backward compatibility purposes. Now implemented directly at the base class level (see
            [`PreTrainedTokenizer.tokenize`]) List of token not to split.
    """
    # union() returns a new set by concatenating the two sets.
    never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
    text = self._clean_text(text)

    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia.).
    if self.tokenize_chinese_chars:
        text = self._tokenize_chinese_chars(text)
    # prevents treating the same character with different unicode codepoints as different characters
    unicode_normalized_text = unicodedata.normalize("NFC", text)
    orig_tokens = whitespace_tokenize(unicode_normalized_text)
    split_tokens = []
    for token in orig_tokens:
        if token not in never_split:
            if self.do_lower_case:
                token = token.lower()
                if self.strip_accents is not False:
                    token = self._run_strip_accents(token)
            elif self.strip_accents:
                token = self._run_strip_accents(token)
        split_tokens.extend(self._run_split_on_punc(token, never_split))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer

Bases: PreTrainedTokenizer

This tokenizer inherits from [BertTokenizer] which contains most of the methods. Users should refer to the superclass for more information regarding methods.

PARAMETER DESCRIPTION
vocab_file

Path to the vocabulary file.

TYPE: `str`

do_lower_case

Whether or not to lowercase the input when tokenizing.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

do_basic_tokenize

Whether or not to do basic tokenization before WordPiece.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

never_split

Collection of tokens which will never be split during tokenization. Only has an effect when do_basic_tokenize=True

TYPE: `Iterable`, *optional* DEFAULT: None

bos_token

The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.

When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the cls_token.

TYPE: `str`, *optional*, defaults to `"<s>"` DEFAULT: '<s>'

eos_token

The end of sequence token.

When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the sep_token.

TYPE: `str`, *optional*, defaults to `"</s>"` DEFAULT: '</s>'

sep_token

The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens.

TYPE: `str`, *optional*, defaults to `"</s>"` DEFAULT: '</s>'

cls_token

The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens.

TYPE: `str`, *optional*, defaults to `"<s>"` DEFAULT: '<s>'

unk_token

The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead.

TYPE: `str`, *optional*, defaults to `"[UNK]"` DEFAULT: '[UNK]'

pad_token

The token used for padding, for example when batching sequences of different lengths.

TYPE: `str`, *optional*, defaults to `"<pad>"` DEFAULT: '<pad>'

mask_token

The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict.

TYPE: `str`, *optional*, defaults to `"<mask>"` DEFAULT: '<mask>'

tokenize_chinese_chars

Whether or not to tokenize Chinese characters.

This should likely be deactivated for Japanese (see this issue).

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

strip_accents

Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for lowercase (as in the original BERT).

TYPE: `bool`, *optional* DEFAULT: None

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
class MPNetTokenizer(PreTrainedTokenizer):
    """

    This tokenizer inherits from [`BertTokenizer`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="[UNK]",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """
        This method initializes an instance of the MPNetTokenizer class.

        Args:
            self: The instance of the class.
            vocab_file (str): Path to the vocabulary file.
            do_lower_case (bool, optional): Whether to convert tokens to lowercase. Defaults to True.
            do_basic_tokenize (bool, optional): Whether to perform basic tokenization. Defaults to True.
            never_split (list, optional): List of tokens that should not be split. Defaults to None.
            bos_token (str, optional): Beginning of sequence token. Defaults to '<s>'.
            eos_token (str, optional): End of sequence token. Defaults to '</s>'.
            sep_token (str, optional): Separator token. Defaults to '</s>'.
            cls_token (str, optional): Classification token. Defaults to '<s>'.
            unk_token (str, optional): Token for unknown words. Defaults to '[UNK]'.
            pad_token (str, optional): Padding token. Defaults to '<pad>'.
            mask_token (str, optional): Mask token. Defaults to '<mask>'.
            tokenize_chinese_chars (bool, optional): Whether to tokenize Chinese characters. Defaults to True.
            strip_accents (str, optional): Method for stripping accents. Defaults to None.
            **kwargs: Additional keyword arguments.

        Returns:
            None.

        Raises:
            ValueError: If the vocabulary file specified by 'vocab_file' cannot be found.
        """
        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """
        Method 'do_lower_case' in the class 'MPNetTokenizer'.
        This method converts the text to lowercase using the basic tokenizer provided by the MPNetTokenizer.

        Args:
            self: An instance of the MPNetTokenizer class.

        Returns:
            None.

        Raises:
            None
        """
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """
        Returns the size of the vocabulary.

        Args:
            self (MPNetTokenizer): An instance of the MPNetTokenizer class.

        Returns:
            int: The size of the vocabulary.

        Raises:
            None.
        """
        return len(self.vocab)

    def get_vocab(self):
        """
        Method to retrieve the vocabulary from the MPNetTokenizer.

        Args:
            self: The instance of the MPNetTokenizer class.

        Returns:
            dict: A dictionary containing the combined vocabulary of added tokens and the original vocabulary.

        Raises:
            None
        """
        # "<mask>" is part of the vocab, but was wrongfully added at a wrong index in the fast saved version
        vocab = self.added_tokens_encoder.copy()
        vocab.update(self.vocab)
        return vocab

    def _tokenize(self, text):
        """
        Method to tokenize the input text using basic or wordpiece tokenizer.

        Args:
            self (MPNetTokenizer): An instance of the MPNetTokenizer class.
            text (str): The input text to be tokenized.

        Returns:
            list: A list of tokens after tokenization. If basic tokenization is enabled, tokens are split based on basic rules.
                If basic tokenization is disabled, tokens are split using the wordpiece tokenizer.

        Raises:
            None.
        """
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                # If the token is part of the never_split set
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A MPNet sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Set to True if the token list is already formatted with special tokens for the model

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary to a file in the specified directory with an optional filename prefix.

        Args:
            self (MPNetTokenizer): The instance of the MPNetTokenizer class.
            save_directory (str): The directory path where the vocabulary file will be saved.
            filename_prefix (Optional[str]): An optional prefix to be added to the filename. Default is None.

        Returns:
            Tuple[str]: A tuple containing the path to the saved vocabulary file.

        Raises:
            IOError: If an error occurs while writing the vocabulary file.
        """
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.do_lower_case property

Method 'do_lower_case' in the class 'MPNetTokenizer'. This method converts the text to lowercase using the basic tokenizer provided by the MPNetTokenizer.

PARAMETER DESCRIPTION
self

An instance of the MPNetTokenizer class.

RETURNS DESCRIPTION

None.

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.vocab_size property

Returns the size of the vocabulary.

PARAMETER DESCRIPTION
self

An instance of the MPNetTokenizer class.

TYPE: MPNetTokenizer

RETURNS DESCRIPTION
int

The size of the vocabulary.

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.__init__(vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, bos_token='<s>', eos_token='</s>', sep_token='</s>', cls_token='<s>', unk_token='[UNK]', pad_token='<pad>', mask_token='<mask>', tokenize_chinese_chars=True, strip_accents=None, **kwargs)

This method initializes an instance of the MPNetTokenizer class.

PARAMETER DESCRIPTION
self

The instance of the class.

vocab_file

Path to the vocabulary file.

TYPE: str

do_lower_case

Whether to convert tokens to lowercase. Defaults to True.

TYPE: bool DEFAULT: True

do_basic_tokenize

Whether to perform basic tokenization. Defaults to True.

TYPE: bool DEFAULT: True

never_split

List of tokens that should not be split. Defaults to None.

TYPE: list DEFAULT: None

bos_token

Beginning of sequence token. Defaults to ''.

TYPE: str DEFAULT: '<s>'

eos_token

End of sequence token. Defaults to ''.

TYPE: str DEFAULT: '</s>'

sep_token

Separator token. Defaults to ''.

TYPE: str DEFAULT: '</s>'

cls_token

Classification token. Defaults to ''.

TYPE: str DEFAULT: '<s>'

unk_token

Token for unknown words. Defaults to '[UNK]'.

TYPE: str DEFAULT: '[UNK]'

pad_token

Padding token. Defaults to ''.

TYPE: str DEFAULT: '<pad>'

mask_token

Mask token. Defaults to ''.

TYPE: str DEFAULT: '<mask>'

tokenize_chinese_chars

Whether to tokenize Chinese characters. Defaults to True.

TYPE: bool DEFAULT: True

strip_accents

Method for stripping accents. Defaults to None.

TYPE: str DEFAULT: None

**kwargs

Additional keyword arguments.

DEFAULT: {}

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the vocabulary file specified by 'vocab_file' cannot be found.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    do_basic_tokenize=True,
    never_split=None,
    bos_token="<s>",
    eos_token="</s>",
    sep_token="</s>",
    cls_token="<s>",
    unk_token="[UNK]",
    pad_token="<pad>",
    mask_token="<mask>",
    tokenize_chinese_chars=True,
    strip_accents=None,
    **kwargs,
):
    """
    This method initializes an instance of the MPNetTokenizer class.

    Args:
        self: The instance of the class.
        vocab_file (str): Path to the vocabulary file.
        do_lower_case (bool, optional): Whether to convert tokens to lowercase. Defaults to True.
        do_basic_tokenize (bool, optional): Whether to perform basic tokenization. Defaults to True.
        never_split (list, optional): List of tokens that should not be split. Defaults to None.
        bos_token (str, optional): Beginning of sequence token. Defaults to '<s>'.
        eos_token (str, optional): End of sequence token. Defaults to '</s>'.
        sep_token (str, optional): Separator token. Defaults to '</s>'.
        cls_token (str, optional): Classification token. Defaults to '<s>'.
        unk_token (str, optional): Token for unknown words. Defaults to '[UNK]'.
        pad_token (str, optional): Padding token. Defaults to '<pad>'.
        mask_token (str, optional): Mask token. Defaults to '<mask>'.
        tokenize_chinese_chars (bool, optional): Whether to tokenize Chinese characters. Defaults to True.
        strip_accents (str, optional): Method for stripping accents. Defaults to None.
        **kwargs: Additional keyword arguments.

    Returns:
        None.

    Raises:
        ValueError: If the vocabulary file specified by 'vocab_file' cannot be found.
    """
    bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
    eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
    sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
    cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
    unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
    pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

    # Mask token behave like a normal word, i.e. include the space before it
    mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

    if not os.path.isfile(vocab_file):
        raise ValueError(
            f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
            " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = BasicTokenizer(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
        )
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

    super().__init__(
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        never_split=never_split,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        tokenize_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        **kwargs,
    )

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1=None)

Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A MPNet sequence has the following format:

  • single sequence: <s> X </s>
  • pair of sequences: <s> A </s></s> B </s>
PARAMETER DESCRIPTION
token_ids_0

List of IDs to which the special tokens will be added

TYPE: `List[int]`

token_ids_1

Optional second list of IDs for sequence pairs.

TYPE: `List[int]`, *optional* DEFAULT: None

RETURNS DESCRIPTION
List[int]

List[int]: list of input IDs with the appropriate special tokens.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
    adding special tokens. A MPNet sequence has the following format:

    - single sequence: `<s> X </s>`
    - pair of sequences: `<s> A </s></s> B </s>`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    if token_ids_1 is None:
        return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
    cls = [self.cls_token_id]
    sep = [self.sep_token_id]
    return cls + token_ids_0 + sep + sep + token_ids_1 + sep

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.convert_tokens_to_string(tokens)

Converts a sequence of tokens (string) in a single string.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
308
309
310
311
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string."""
    out_string = " ".join(tokens).replace(" ##", "").strip()
    return out_string

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.create_token_type_ids_from_sequences(token_ids_0, token_ids_1=None)

Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not make use of token type ids, therefore a list of zeros is returned.

PARAMETER DESCRIPTION
token_ids_0

List of ids.

TYPE: `List[int]`

token_ids_1

Optional second list of IDs for sequence pairs.

TYPE: `List[int]`, *optional* DEFAULT: None

RETURNS DESCRIPTION
List[int]

List[int]: List of zeros.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
    make use of token type ids, therefore a list of zeros is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of ids.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of zeros.
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]

    if token_ids_1 is None:
        return len(cls + token_ids_0 + sep) * [0]
    return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.get_special_tokens_mask(token_ids_0, token_ids_1=None, already_has_special_tokens=False)

Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer prepare_for_model methods.

PARAMETER DESCRIPTION
token_ids_0

List of ids.

TYPE: `List[int]`

token_ids_1

Optional second list of IDs for sequence pairs.

TYPE: `List[int]`, *optional* DEFAULT: None

already_has_special_tokens

Set to True if the token list is already formatted with special tokens for the model

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

RETURNS DESCRIPTION
List[int]

List[int]: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` methods.

    Args:
        token_ids_0 (`List[int]`):
            List of ids.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Set to True if the token list is already formatted with special tokens for the model

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    if token_ids_1 is None:
        return [1] + ([0] * len(token_ids_0)) + [1]
    return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.get_vocab()

Method to retrieve the vocabulary from the MPNetTokenizer.

PARAMETER DESCRIPTION
self

The instance of the MPNetTokenizer class.

RETURNS DESCRIPTION
dict

A dictionary containing the combined vocabulary of added tokens and the original vocabulary.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def get_vocab(self):
    """
    Method to retrieve the vocabulary from the MPNetTokenizer.

    Args:
        self: The instance of the MPNetTokenizer class.

    Returns:
        dict: A dictionary containing the combined vocabulary of added tokens and the original vocabulary.

    Raises:
        None
    """
    # "<mask>" is part of the vocab, but was wrongfully added at a wrong index in the fast saved version
    vocab = self.added_tokens_encoder.copy()
    vocab.update(self.vocab)
    return vocab

mindnlp.transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer.save_vocabulary(save_directory, filename_prefix=None)

Save the vocabulary to a file in the specified directory with an optional filename prefix.

PARAMETER DESCRIPTION
self

The instance of the MPNetTokenizer class.

TYPE: MPNetTokenizer

save_directory

The directory path where the vocabulary file will be saved.

TYPE: str

filename_prefix

An optional prefix to be added to the filename. Default is None.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
Tuple[str]

Tuple[str]: A tuple containing the path to the saved vocabulary file.

RAISES DESCRIPTION
IOError

If an error occurs while writing the vocabulary file.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save the vocabulary to a file in the specified directory with an optional filename prefix.

    Args:
        self (MPNetTokenizer): The instance of the MPNetTokenizer class.
        save_directory (str): The directory path where the vocabulary file will be saved.
        filename_prefix (Optional[str]): An optional prefix to be added to the filename. Default is None.

    Returns:
        Tuple[str]: A tuple containing the path to the saved vocabulary file.

    Raises:
        IOError: If an error occurs while writing the vocabulary file.
    """
    index = 0
    if os.path.isdir(save_directory):
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
    else:
        vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
    with open(vocab_file, "w", encoding="utf-8") as writer:
        for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
            if index != token_index:
                logger.warning(
                    f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                    " Please check that the vocabulary is not corrupted!"
                )
                index = token_index
            writer.write(token + "\n")
            index += 1
    return (vocab_file,)

mindnlp.transformers.models.mpnet.tokenization_mpnet.WordpieceTokenizer

Bases: object

Runs WordPiece tokenization.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""
    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        """
        Initializes a new instance of the WordpieceTokenizer class.

        Args:
            self: The instance of the WordpieceTokenizer class.
            vocab (list): A list of vocabulary tokens used for tokenization.
            unk_token (str): The token to be used for representing unknown words.
            max_input_chars_per_word (int): The maximum number of characters allowed per input word. Defaults to 100.

        Returns:
            None.

        Raises:
            ValueError: If max_input_chars_per_word is less than or equal to 0.
            TypeError: If vocab is not a list or unk_token is not a string.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens

mindnlp.transformers.models.mpnet.tokenization_mpnet.WordpieceTokenizer.__init__(vocab, unk_token, max_input_chars_per_word=100)

Initializes a new instance of the WordpieceTokenizer class.

PARAMETER DESCRIPTION
self

The instance of the WordpieceTokenizer class.

vocab

A list of vocabulary tokens used for tokenization.

TYPE: list

unk_token

The token to be used for representing unknown words.

TYPE: str

max_input_chars_per_word

The maximum number of characters allowed per input word. Defaults to 100.

TYPE: int DEFAULT: 100

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If max_input_chars_per_word is less than or equal to 0.

TypeError

If vocab is not a list or unk_token is not a string.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
    """
    Initializes a new instance of the WordpieceTokenizer class.

    Args:
        self: The instance of the WordpieceTokenizer class.
        vocab (list): A list of vocabulary tokens used for tokenization.
        unk_token (str): The token to be used for representing unknown words.
        max_input_chars_per_word (int): The maximum number of characters allowed per input word. Defaults to 100.

    Returns:
        None.

    Raises:
        ValueError: If max_input_chars_per_word is less than or equal to 0.
        TypeError: If vocab is not a list or unk_token is not a string.
    """
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word

mindnlp.transformers.models.mpnet.tokenization_mpnet.WordpieceTokenizer.tokenize(text)

Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary.

For example, input = "unaffable" wil return as output ["un", "##aff", "##able"].

PARAMETER DESCRIPTION
text

A single token or whitespace separated tokens. This should have already been passed through BasicTokenizer.

RETURNS DESCRIPTION

A list of wordpiece tokens.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
def tokenize(self, text):
    """
    Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
    tokenization using the given vocabulary.

    For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.

    Args:
        text: A single token or whitespace separated tokens. This should have
            already been passed through *BasicTokenizer*.

    Returns:
        A list of wordpiece tokens.
    """
    output_tokens = []
    for token in whitespace_tokenize(text):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            output_tokens.append(self.unk_token)
            continue

        is_bad = False
        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            while start < end:
                substr = "".join(chars[start:end])
                if start > 0:
                    substr = "##" + substr
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                is_bad = True
                break
            sub_tokens.append(cur_substr)
            start = end

        if is_bad:
            output_tokens.append(self.unk_token)
        else:
            output_tokens.extend(sub_tokens)
    return output_tokens

mindnlp.transformers.models.mpnet.tokenization_mpnet.load_vocab(vocab_file)

Loads a vocabulary file into a dictionary.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
50
51
52
53
54
55
56
57
58
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab

mindnlp.transformers.models.mpnet.tokenization_mpnet.whitespace_tokenize(text)

Runs basic whitespace cleaning and splitting on a piece of text.

Source code in mindnlp\transformers\models\mpnet\tokenization_mpnet.py
61
62
63
64
65
66
67
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens