mamba

`mindnlp.transformers.models.mamba.modeling_mamba` ¶

MindSpore MAMBA model.

`mindnlp.transformers.models.mamba.modeling_mamba.MambaCausalLMOutput` `dataclass` ¶

Bases: ModelOutput

Base class for causal language model (or autoregressive) outputs.

PARAMETER	DESCRIPTION
`loss`	Language modeling loss (for next-token prediction). TYPE: `mindspore.Tensor` of shape `(1,)`, optional, returned when `labels` is provided DEFAULT: `None`
`logits`	Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)` DEFAULT: `None`
`cache_params`	The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to avoid providing the old `input_ids`. Includes both the State space model state matrices after the selective scan, and the Convolutional states TYPE: `MambaCache` DEFAULT: `None`
`hidden_states`	Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. TYPE: `tuple(mindspore.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` DEFAULT: `None`

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

@dataclass
class MambaCausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Every field is declared with a default of `None`.

    Args:
        loss (`mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the state space model state matrices after the selective scan, and the convolutional
            states.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    loss: Optional[mindspore.Tensor] = None
    logits: Optional[mindspore.Tensor] = None
    cache_params: Optional[MambaCache] = None
    hidden_states: Optional[Tuple[mindspore.Tensor]] = None

`mindnlp.transformers.models.mamba.modeling_mamba.MambaForCausalLM` ¶

Bases: MambaPreTrainedModel

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

class MambaForCausalLM(MambaPreTrainedModel):
    """`MambaModel` backbone followed by a bias-free linear language modeling head."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.backbone = MambaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Weight initialization and final processing are delegated to `post_init`.
        self.post_init()

    def get_output_embeddings(self):
        """Return the language modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head."""
        self.lm_head = new_embeddings

    def get_input_embeddings(self):
        """Return the input embeddings held by the backbone."""
        return self.backbone.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Forward the new input embeddings to the backbone."""
        return self.backbone.set_input_embeddings(new_embeddings)

    def _update_model_kwargs_for_generation(
        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], num_new_tokens: int = 1, **kwargs
    ) -> Dict[str, Any]:
        """Carry the cache, the cache position and the attention mask over to the next generation step."""
        model_kwargs["cache_params"] = outputs.get("cache_params", None)

        # Advance the position only when caching is on and a position is already being tracked.
        caching_enabled = model_kwargs.get("use_cache", True)
        previous_position = model_kwargs.get("cache_position")
        if caching_enabled and previous_position is not None:
            model_kwargs["cache_position"] = previous_position[-1:] + num_new_tokens

        # Extend the mask by one column of ones.
        if "attention_mask" in model_kwargs:
            mask = model_kwargs["attention_mask"]
            extra_column = ops.ones((mask.shape[0], 1), dtype=mask.dtype)
            model_kwargs["attention_mask"] = ops.cat([mask, extra_column], dim=-1)

        return model_kwargs

    def prepare_inputs_for_generation(
        self,
        input_ids,
        inputs_embeds=None,
        use_cache=None,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        **kwargs,
    ):
        """
        Assemble the keyword arguments passed to `forward` for one generation step.

        Raises:
            ValueError: if `use_cache` is truthy and `cache_position` is `None`.
        """
        if use_cache:
            # `cache_position` should have been initialized in `generate`
            if cache_position is None:
                raise ValueError(
                    "`cache_position` should not be None as it should have been initialized in "
                    "`model.generate`, you are responsible for passing in a valid `cache_position` if "
                    "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`"
                )

            if cache_position[0] > 0:
                # Past the first step: keep only the newest token and drop the mask.
                input_ids = input_ids[:, -1].unsqueeze(-1)
                attention_mask = None
            else:
                # Prefill: `cache_position` always spans the full `conv_states` length
                # (`config.conv_kernel`), because shorter inputs are padded and longer ones
                # are truncated to that length.
                cache_position = ops.arange(0, self.config.conv_kernel)

        # Embeddings are forwarded only while no cache exists yet.
        feed_embeddings = inputs_embeds is not None and cache_params is None
        model_inputs = {"inputs_embeds": inputs_embeds} if feed_embeddings else {"input_ids": input_ids}

        model_inputs["cache_params"] = cache_params
        model_inputs["use_cache"] = use_cache
        model_inputs["cache_position"] = cache_position
        model_inputs["attention_mask"] = attention_mask
        return model_inputs

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        cache_params: Optional[MambaCache] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[mindspore.Tensor] = None,
        **kwargs,  # for now we need this for generation
    ) -> Union[Tuple, MambaCausalLMOutput]:
        r"""
        Run the backbone, project its first output to vocabulary logits and, when `labels` is given, compute
        the language modeling loss.

        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
            `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.backbone(
            input_ids,
            cache_params=cache_params,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
            cache_position=cache_position,
            attention_mask=attention_mask,
        )
        last_hidden = backbone_outputs[0]

        # Cast to the head's dtype for the projection, then return float logits.
        head_dtype = self.lm_head.weight.dtype
        logits = self.lm_head(last_hidden.to(head_dtype)).float()

        loss = None
        if labels is not None:
            # Position t is scored against token t + 1.
            predictions = logits[..., :-1, :]
            targets = labels[..., 1:]
            # Flatten batch and sequence dimensions before the loss.
            criterion = CrossEntropyLoss()
            loss = criterion(predictions.view(-1, predictions.shape[-1]), targets.view(-1))

        if return_dict:
            return MambaCausalLMOutput(
                loss=loss,
                logits=logits,
                cache_params=backbone_outputs.cache_params,
                hidden_states=backbone_outputs.hidden_states,
            )

        tuple_output = (logits,) + backbone_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

`mindnlp.transformers.models.mamba.modeling_mamba.MambaForCausalLM.forward(input_ids=None, attention_mask=None, inputs_embeds=None, cache_params=None, labels=None, output_hidden_states=None, return_dict=None, use_cache=None, cache_position=None, **kwargs)` ¶

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size].

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    cache_params: Optional[MambaCache] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    use_cache: Optional[bool] = None,
    cache_position: Optional[mindspore.Tensor] = None,
    **kwargs,  # for now we need this for generation
) -> Union[Tuple, MambaCausalLMOutput]:
    r"""
    Run the backbone, project its first output to vocabulary logits and, when `labels` is given, compute
    the language modeling loss.

    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
        `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
        `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict

    backbone_outputs = self.backbone(
        input_ids,
        cache_params=cache_params,
        inputs_embeds=inputs_embeds,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        use_cache=use_cache,
        cache_position=cache_position,
        attention_mask=attention_mask,
    )
    last_hidden = backbone_outputs[0]

    # Cast to the head's dtype for the projection, then return float logits.
    head_dtype = self.lm_head.weight.dtype
    logits = self.lm_head(last_hidden.to(head_dtype)).float()

    loss = None
    if labels is not None:
        # Position t is scored against token t + 1.
        predictions = logits[..., :-1, :]
        targets = labels[..., 1:]
        # Flatten batch and sequence dimensions before the loss.
        criterion = CrossEntropyLoss()
        loss = criterion(predictions.view(-1, predictions.shape[-1]), targets.view(-1))

    if return_dict:
        return MambaCausalLMOutput(
            loss=loss,
            logits=logits,
            cache_params=backbone_outputs.cache_params,
            hidden_states=backbone_outputs.hidden_states,
        )

    tuple_output = (logits,) + backbone_outputs[1:]
    if loss is None:
        return tuple_output
    return (loss,) + tuple_output

`mindnlp.transformers.models.mamba.modeling_mamba.MambaMixer` ¶

Bases: Module

Compute ∆, A, B, C, and D the state space parameters and compute the contextualized_states. A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, and is why Mamba is called selective state spaces)

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

class MambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    """

    def __init__(self, config: MambaConfig, layer_idx: int):
        """
        Build the depthwise convolution, the linear projections and the state space parameters.

        Args:
            config (`MambaConfig`):
                Read for `hidden_size`, `state_size`, `conv_kernel`, `intermediate_size`, `time_step_rank`,
                `use_conv_bias`, `use_bias` and `hidden_act`.
            layer_idx (`int`):
                Index used to select this layer's entries in a `MambaCache`.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.state_size
        self.conv_kernel_size = config.conv_kernel
        self.intermediate_size = config.intermediate_size
        self.time_step_rank = int(config.time_step_rank)
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias
        # One filter per channel (groups == channels); the padding of `conv_kernel - 1` yields extra
        # output positions that `slow_forward` trims back to `seq_len`.
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=config.use_conv_bias,
            kernel_size=config.conv_kernel,
            groups=self.intermediate_size,
            padding=config.conv_kernel - 1,
        )

        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
        # selective projection used to make dt, B and C input dependent
        self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
        # time step projection (discretization)
        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = ops.arange(1, self.ssm_state_size + 1, dtype=mindspore.float32)[None, :]
        A = A.broadcast_to((self.intermediate_size, -1))

        # A is stored as its logarithm; `slow_forward` recovers it as `-exp(A_log)`.
        self.A_log = nn.Parameter(ops.log(A))
        self.D = nn.Parameter(ops.ones(self.intermediate_size))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
        self.use_bias = config.use_bias

    # fmt: off
    def slow_forward(self, input_states, cache_params: Optional[MambaCache]=None, cache_position:Optional[mindspore.Tensor]=None, attention_mask: Optional[mindspore.Tensor] = None):
        """
        Run the mixer with an explicit Python loop over the sequence dimension.

        Args:
            input_states:
                Rank-3 tensor, unpacked as `(batch_size, seq_len, _)`.
            cache_params (`MambaCache`, *optional*):
                When given, this layer's SSM state is read from it and written back at the end, and its
                convolution state is updated.
            cache_position (`mindspore.Tensor`, *optional*):
                Only read when `cache_params` is given. A first dimension equal to `conv_kernel_size` selects
                the prefill branch; any other size selects the single-step decoding branch.
            attention_mask (`mindspore.Tensor`, *optional*):
                Multiplied into the hidden states before and after the convolution
                (presumably of shape `(batch_size, seq_len)` -- TODO confirm against callers).

        Returns:
            The output of `out_proj`, annotated below as `[batch, seq_len, hidden_size]`.
        """
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype
        # 1. Gated MLP's linear projection
        projected_states = ops.transpose(self.in_proj(input_states), 1, 2)                   # [batch, 2 * intermediate_size, seq_len]
        hidden_states, gate = ops.chunk(projected_states, 2, dim=1)

        if attention_mask is not None:
            hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 2. Convolution sequence transformation
        if cache_params is not None:
            ssm_state = cache_params.ssm_states[self.layer_idx].copy()
            # use `cache_position.shape[0]` to check whether we are in prefill
            # stage, it's equivalent to check `cache_position[0] == 0`, which
            # breaks dynamo fullgraph constraints
            if cache_position.shape[0] == self.conv_kernel_size:
                # Prefill: the cached conv state is the last `conv_kernel_size` positions,
                # left-padded when the input is shorter than the kernel.
                if self.conv_kernel_size - hidden_states.shape[-1] > 0:
                    conv_state = nn.functional.pad(
                        hidden_states,
                        (self.conv_kernel_size - hidden_states.shape[-1], 0)
                    )
                else:
                    conv_state = hidden_states[:, :, hidden_states.shape[-1] - self.conv_kernel_size:]

                cache_params.update_conv_state(self.layer_idx, conv_state, cache_position)
                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
            else:
                # Decoding: apply the convolution by hand as a weighted sum over the cached window.
                conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position)
                hidden_states = ops.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)         # [batch, intermediate_size, 1] : decoding
        else:
            # No cache: start the recurrence from an all-zero SSM state.
            ssm_state = ops.zeros(
                (batch_size, self.intermediate_size, self.ssm_state_size), dtype=dtype
            )
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])         # [batch, intermediate_size, seq_len]

        if attention_mask is not None:
            hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
        ssm_parameters = self.x_proj(ops.transpose(hidden_states, 1, 2))
        time_step, B, C = ops.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
        )
        discrete_time_step = self.dt_proj(time_step)                                    # [batch, seq_len, intermediate_size]
        discrete_time_step = ops.transpose(nn.functional.softplus(discrete_time_step), 1, 2) # [batch, intermediate_size, seq_len]

        # 3.b. Discretization: A and B to [batch, intermediate_size, seq_len, ssm_state_size]
        A = -ops.exp(self.A_log.float())                                              # [intermediate_size, ssm_state_size]
        discrete_A = ops.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size]
        discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float()       # [batch, intermediate_size, seq_len, ssm_state_size]
        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()


        # 3.c. Sequential scan: one state update and one readout through C per position.
        scan_outputs = []
        for i in range(seq_len):
            ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]      # [batch, intermediate_size, ssm_state]
            scan_output = ops.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
            scan_outputs.append(scan_output[:, :, 0])
        scan_output = ops.stack(scan_outputs, dim=-1)                                # [batch, intermediate_size, seq_len]
        # Skip connection scaled by D, followed by the gate.
        scan_output = scan_output + (hidden_states * self.D[None, :, None])
        scan_output = (scan_output * self.act(gate))

        if cache_params is not None:
            # Store the final state back in the cache's own dtype.
            cache_params.ssm_states[self.layer_idx] = ssm_state.to(cache_params.ssm_states[self.layer_idx].dtype)

        # 4. Final linear projection
        contextualized_states = self.out_proj(ops.transpose(scan_output, 1, 2))  # [batch, seq_len, hidden_size]
        return contextualized_states
    # fmt: on

    def forward(
        self,
        hidden_states,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
    ):
        """Delegate to `slow_forward` with the same arguments."""
        return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask)

`mindnlp.transformers.models.mamba.modeling_mamba.MambaOutput` `dataclass` ¶

Bases: ModelOutput

Class for the MAMBA model outputs.

PARAMETER	DESCRIPTION
`last_hidden_state`	Sequence of hidden-states at the output of the last layer of the model. TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)` DEFAULT: `None`
`cache_params`	The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to avoid providing the old `input_ids`. Includes both the State space model state matrices after the selective scan, and the Convolutional states TYPE: `MambaCache` DEFAULT: `None`
`hidden_states`	Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. TYPE: `tuple(mindspore.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` DEFAULT: `None`

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

@dataclass
class MambaOutput(ModelOutput):
    """
    Class for the MAMBA model outputs.

    Every field is declared with a default of `None`.

    Args:
        last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the state space model state matrices after the selective scan, and the convolutional
            states.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: Optional[mindspore.Tensor] = None
    cache_params: Optional[MambaCache] = None
    hidden_states: Optional[Tuple[mindspore.Tensor]] = None

`mindnlp.transformers.models.mamba.modeling_mamba.MambaPreTrainedModel` ¶

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

class MambaPreTrainedModel(PreTrainedModel):
    """
    Abstract base that handles weight initialization and provides a simple interface for downloading and
    loading pretrained models.
    """

    config_class = MambaConfig
    base_model_prefix = "backbone"
    _no_split_modules = ["MambaBlock", "MambaMixer"]
    supports_gradient_checkpointing = True
    _is_stateful = True

    def _init_weights(self, module):
        """Initialize the weights."""
        cfg = self.config

        if isinstance(module, MambaMixer):
            module.A_log._no_weight_decay = True
            module.D._no_weight_decay = True

            dt_init_std = cfg.time_step_rank**-0.5 * cfg.time_step_scale
            scheme = cfg.time_step_init_scheme
            if scheme == "constant":
                nn.init.constant_(module.dt_proj.weight, dt_init_std)
            elif scheme == "random":
                nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)

            # Draw dt log-uniformly between `time_step_min` and `time_step_max`, floored at `time_step_floor`.
            log_span = math.log(cfg.time_step_max) - math.log(cfg.time_step_min)
            log_dt = ops.rand(cfg.intermediate_size) * log_span + math.log(cfg.time_step_min)
            dt = ops.exp(log_dt).clamp(min=cfg.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + ops.log(-ops.expm1(-dt))
            with no_grad():
                module.dt_proj.bias.assign_value(inv_dt)
            # Flag the bias so the generic `nn.Linear` branch below leaves it alone.
            module.dt_proj.bias._no_reinit = True

        if isinstance(module, nn.Linear):
            bias = module.bias
            if bias is not None and not getattr(bias, "_no_reinit", False):
                nn.init.zeros_(bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=cfg.initializer_range)

        if not cfg.rescale_prenorm_residual:
            return

        # Reinitialize selected weights following the GPT-2 scheme:
        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if name != "out_proj.weight":
                continue
            # Draw fresh values before scaling: this hook may run more than once, and scaling
            # the existing values in place would shrink them on every call.
            nn.init.kaiming_uniform_(p, a=math.sqrt(5))
            with no_grad():
                # Scale by 1/sqrt(num_hidden_layers).
                p /= math.sqrt(cfg.num_hidden_layers)

`mindnlp.transformers.models.mamba.modeling_mamba.MambaRMSNorm` ¶

Bases: Module

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

class MambaRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        """
        super().__init__()
        self.weight = nn.Parameter(ops.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize in float32, cast back to the input dtype, then apply the scale."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(mindspore.float32)
        mean_square = ops.mean(upcast.pow(2), -1, keepdim=True)
        normalized = upcast * ops.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        """Return the size of the scale vector and the epsilon, for the module repr."""
        size = self.weight.shape[0]
        return f"{size}, eps={self.variance_epsilon}"

`mindnlp.transformers.models.mamba.modeling_mamba.MambaRMSNorm.__init__(hidden_size, eps=1e-06)` ¶

MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm

Source code in mindnlp\transformers\models\mamba\modeling_mamba.py

def __init__(self, hidden_size, eps=1e-6):
    """
    MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm.

    Creates a scale parameter of ones with `hidden_size` entries and stores `eps` as `variance_epsilon`.
    """
    super().__init__()
    initial_scale = ops.ones(hidden_size)
    self.weight = nn.Parameter(initial_scale)
    self.variance_epsilon = eps

`mindnlp.transformers.models.mamba.configuration_mamba` ¶

MAMBA configuration

`mindnlp.transformers.models.mamba.configuration_mamba.MambaConfig` ¶

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [MambaModel]. It is used to instantiate a MAMBA model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the MAMBA state-spaces/mamba-2.8b architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER	DESCRIPTION
`vocab_size`	Vocabulary size of the MAMBA model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`MambaModel`]. TYPE: `int`, optional, defaults to 50280 DEFAULT: `50280`
`hidden_size`	Dimensionality of the embeddings and hidden states. TYPE: `int`, optional, defaults to 768 DEFAULT: `768`
`state_size`	shape of the state space latents. TYPE: `int`, optional, defaults to 16 DEFAULT: `16`
`num_hidden_layers`	Number of hidden layers in the model. TYPE: `int`, optional, defaults to 32 DEFAULT: `32`
`layer_norm_epsilon`	The epsilon to use in the layer normalization layers. TYPE: `float`, optional, defaults to 1e-05 DEFAULT: `1e-05`
`pad_token_id`	Padding token id. TYPE: `int`, optional, defaults to 0 DEFAULT: `0`
`bos_token_id`	The id of the beginning of sentence token in the vocabulary. TYPE: `int`, optional, defaults to 0 DEFAULT: `0`
`eos_token_id`	The id of the end of sentence token in the vocabulary. TYPE: `int`, optional, defaults to 0 DEFAULT: `0`
`expand`	Expanding factor used to determine the intermediate size. TYPE: `int`, optional, defaults to 2 DEFAULT: `2`
`conv_kernel`	Size of the convolution kernel. TYPE: `int`, optional, defaults to 4 DEFAULT: `4`
`use_bias`	Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block TYPE: `bool`, optional, defaults to `False` DEFAULT: `False`
`use_conv_bias`	Whether or not to use bias in the convolution layer of the mixer block. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`
`hidden_act`	The non-linear activation function (function or string) in the decoder. TYPE: `str`, optional, defaults to `"silu"` DEFAULT: `'silu'`
`initializer_range`	The standard deviation of the truncated_normal_initializer for initializing all weight matrices. TYPE: `float`, optional, defaults to 0.1 DEFAULT: `0.1`
`residual_in_fp32`	Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`
`time_step_rank`	Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` TYPE: `Union[int,str]`, optional, defaults to `"auto"` DEFAULT: `'auto'`
`time_step_scale`	Scale used to scale `dt_proj.bias`. TYPE: `float`, optional, defaults to 1.0 DEFAULT: `1.0`
`time_step_min`	Minimum `time_step` used to bound `dt_proj.bias`. TYPE: `float`, optional, defaults to 0.001 DEFAULT: `0.001`
`time_step_max`	Maximum `time_step` used to bound `dt_proj.bias`. TYPE: `float`, optional, defaults to 0.1 DEFAULT: `0.1`
`time_step_init_scheme`	Init scheme used for `dt_proj.weight`. Should be one of `["random","constant"]` TYPE: `str`, optional, defaults to `"random"` DEFAULT: `'random'`
`time_step_floor`	Minimum clamping value of the `dt_proj.bias` layer initialization. TYPE: `float`, optional, defaults to 0.0001 DEFAULT: `0.0001`
`rescale_prenorm_residual`	Whether or not to rescale `out_proj` weights when initializing. TYPE: `bool`, optional, defaults to `False` DEFAULT: `False`
`use_cache`	Whether or not the cache should be used. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`

Example

>>> from mindnlp.transformers import MambaConfig, MambaModel
...
>>> # Initializing a Mamba configuration
>>> configuration = MambaConfig()
...
>>> # Initializing a model (with random weights) from the configuration
>>> model = MambaModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config

Source code in mindnlp\transformers\models\mamba\configuration_mamba.py

class MambaConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`MambaModel`]. It is used to instantiate a MAMBA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the MAMBA
    [state-spaces/mamba-2.8b](https://hf-mirror.com/state-spaces/mamba-2.8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Besides the constructor arguments listed below, the instance also exposes `intermediate_size`, which is derived
    as `int(expand * hidden_size)` and is not a constructor argument.


    Args:
        vocab_size (`int`, *optional*, defaults to 50280):
            Vocabulary size of the MAMBA model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`MambaModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        state_size (`int`, *optional*, defaults to 16): shape of the state space latents.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the model.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
            The epsilon to use in the layer normalization layers.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning of sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end of sentence token in the vocabulary.
        expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
        conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block
        use_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether or not to use bias in the convolution layer of the mixer block.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.1):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
            Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype`
            as the rest of the model
        time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
            Rank of the discretization projection matrix. `"auto"` means that it will default to
            `math.ceil(self.hidden_size / 16)`
        time_step_scale (`float`, *optional*, defaults to 1.0):
            Scale used to scale `dt_proj.bias`.
        time_step_min (`float`, *optional*, defaults to 0.001):
            Minimum `time_step` used to bound `dt_proj.bias`.
        time_step_max (`float`, *optional*, defaults to 0.1):
            Maximum `time_step` used to bound `dt_proj.bias`.
        time_step_init_scheme (`str`, *optional*, defaults to `"random"`):
            Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]`
        time_step_floor (`float`, *optional*, defaults to 0.0001):
            Minimum clamping value of the `dt_proj.bias` layer initialization.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `False`):
            Whether or not to rescale `out_proj` weights when initializing.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the cache should be used.


    Example:
        ```python
        >>> from transformers import MambaConfig, MambaModel
        ...
        >>> # Initializing a Mamba configuration
        >>> configuration = MambaConfig()
        ...
        >>> # Initializing a model (with random weights) from the configuration
        >>> model = MambaModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "mamba"

    def __init__(
        self,
        vocab_size=50280,
        hidden_size=768,
        state_size=16,
        num_hidden_layers=32,
        layer_norm_epsilon=1e-5,
        pad_token_id=0,
        bos_token_id=0,
        eos_token_id=0,
        expand=2,
        conv_kernel=4,
        use_bias=False,
        use_conv_bias=True,
        hidden_act="silu",
        initializer_range=0.1,
        residual_in_fp32=True,
        time_step_rank="auto",
        time_step_scale=1.0,
        time_step_min=0.001,
        time_step_max=0.1,
        time_step_init_scheme="random",
        time_step_floor=1e-4,
        rescale_prenorm_residual=False,
        use_cache=True,
        **kwargs,
    ):
        '''
        Store the Mamba hyperparameters on the instance and initialize the base configuration.

        Each argument is saved as an attribute of the same name. In addition, `intermediate_size` is derived
        as `int(expand * hidden_size)`, and a `time_step_rank` of `"auto"` is resolved to
        `math.ceil(hidden_size / 16)` before being stored.

        Args:
            vocab_size (int, optional): The size of the vocabulary. Defaults to 50280.
            hidden_size (int, optional): The size of the hidden state. Defaults to 768.
            state_size (int, optional): The size of the state. Defaults to 16.
            num_hidden_layers (int, optional): The number of hidden layers. Defaults to 32.
            layer_norm_epsilon (float, optional): The epsilon value for layer normalization. Defaults to 1e-05.
            pad_token_id (int, optional): The token ID for padding. Defaults to 0.
            bos_token_id (int, optional): The token ID for the beginning of sequence. Defaults to 0.
            eos_token_id (int, optional): The token ID for the end of sequence. Defaults to 0.
            expand (int, optional): Factor multiplied by `hidden_size` to obtain `intermediate_size`.
                Defaults to 2.
            conv_kernel (int, optional): The kernel size for convolution. Defaults to 4.
            use_bias (bool, optional): Whether to use bias. Defaults to False.
            use_conv_bias (bool, optional): Whether to use bias in convolution. Defaults to True.
            hidden_act (str, optional): The activation function for hidden layers. Defaults to 'silu'.
            initializer_range (float, optional): The range for weight initialization. Defaults to 0.1.
            residual_in_fp32 (bool, optional): Whether to keep residual in FP32. Defaults to True.
            time_step_rank (str or int, optional): The rank to store, or 'auto' to derive it from
                `hidden_size`. Defaults to 'auto'.
            time_step_scale (float, optional): The scale factor for time step. Defaults to 1.0.
            time_step_min (float, optional): The minimum value for time step. Defaults to 0.001.
            time_step_max (float, optional): The maximum value for time step. Defaults to 0.1.
            time_step_init_scheme (str, optional): The initialization scheme for time step. Defaults to 'random'.
            time_step_floor (float, optional): The floor value for time step. Defaults to 0.0001.
            rescale_prenorm_residual (bool, optional): Whether to rescale pre-norm residuals. Defaults to False.
            use_cache (bool, optional): Whether to use cache. Defaults to True.
            **kwargs: Additional keyword arguments forwarded unchanged to the base-class initializer.

        Returns:
            None.
        '''
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.num_hidden_layers = num_hidden_layers
        self.layer_norm_epsilon = layer_norm_epsilon
        self.conv_kernel = conv_kernel
        self.expand = expand
        # Derived value: not a constructor argument, always recomputed from expand and hidden_size.
        self.intermediate_size = int(expand * self.hidden_size)
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.use_bias = use_bias
        self.use_conv_bias = use_conv_bias
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # "auto" is replaced by a concrete rank; any other value is stored as given.
        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
        self.time_step_scale = time_step_scale
        self.time_step_min = time_step_min
        self.time_step_max = time_step_max
        self.time_step_init_scheme = time_step_init_scheme
        self.time_step_floor = time_step_floor
        self.rescale_prenorm_residual = rescale_prenorm_residual
        self.residual_in_fp32 = residual_in_fp32
        self.use_cache = use_cache

        # The special-token ids are also handed to the base class together with any extra kwargs.
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)

mindnlp.transformers.models.mamba.configuration_mamba.MambaConfig.__init__(vocab_size=50280, hidden_size=768, state_size=16, num_hidden_layers=32, layer_norm_epsilon=1e-05, pad_token_id=0, bos_token_id=0, eos_token_id=0, expand=2, conv_kernel=4, use_bias=False, use_conv_bias=True, hidden_act='silu', initializer_range=0.1, residual_in_fp32=True, time_step_rank='auto', time_step_scale=1.0, time_step_min=0.001, time_step_max=0.1, time_step_init_scheme='random', time_step_floor=0.0001, rescale_prenorm_residual=False, use_cache=True, **kwargs) ¶

Initializes a new instance of the MambaConfig class.

PARAMETER	DESCRIPTION
`self`	The current instance of the MambaConfig class. TYPE: `MambaConfig`
`vocab_size`	The size of the vocabulary. Defaults to 50280. TYPE: `int` DEFAULT: `50280`
`hidden_size`	The size of the hidden state. Defaults to 768. TYPE: `int` DEFAULT: `768`
`state_size`	The size of the state. Defaults to 16. TYPE: `int` DEFAULT: `16`
`num_hidden_layers`	The number of hidden layers. Defaults to 32. TYPE: `int` DEFAULT: `32`
`layer_norm_epsilon`	The epsilon value for layer normalization. Defaults to 1e-05. TYPE: `float` DEFAULT: `1e-05`
`pad_token_id`	The token ID for padding. Defaults to 0. TYPE: `int` DEFAULT: `0`
`bos_token_id`	The token ID for the beginning of sequence. Defaults to 0. TYPE: `int` DEFAULT: `0`
`eos_token_id`	The token ID for the end of sequence. Defaults to 0. TYPE: `int` DEFAULT: `0`
`expand`	The expansion factor. Defaults to 2. TYPE: `int` DEFAULT: `2`
`conv_kernel`	The kernel size for convolution. Defaults to 4. TYPE: `int` DEFAULT: `4`
`use_bias`	Whether to use bias. Defaults to False. TYPE: `bool` DEFAULT: `False`
`use_conv_bias`	Whether to use bias in convolution. Defaults to True. TYPE: `bool` DEFAULT: `True`
`hidden_act`	The activation function for hidden layers. Defaults to 'silu'. TYPE: `str` DEFAULT: `'silu'`
`initializer_range`	The range for weight initialization. Defaults to 0.1. TYPE: `float` DEFAULT: `0.1`
`residual_in_fp32`	Whether to keep residual in FP32. Defaults to True. TYPE: `bool` DEFAULT: `True`
`time_step_rank`	The rank or 'auto' for time step. Defaults to 'auto'. TYPE: `str or int` DEFAULT: `'auto'`
`time_step_scale`	The scale factor for time step. Defaults to 1.0. TYPE: `float` DEFAULT: `1.0`
`time_step_min`	The minimum value for time step. Defaults to 0.001. TYPE: `float` DEFAULT: `0.001`
`time_step_max`	The maximum value for time step. Defaults to 0.1. TYPE: `float` DEFAULT: `0.1`
`time_step_init_scheme`	The initialization scheme for time step. Defaults to 'random'. TYPE: `str` DEFAULT: `'random'`
`time_step_floor`	The floor value for time step. Defaults to 0.0001. TYPE: `float` DEFAULT: `0.0001`
`rescale_prenorm_residual`	Whether to rescale pre-norm residuals. Defaults to False. TYPE: `bool` DEFAULT: `False`
`use_cache`	Whether to use cache. Defaults to True. TYPE: `bool` DEFAULT: `True`

RETURNS	DESCRIPTION
	None.

Source code in mindnlp\transformers\models\mamba\configuration_mamba.py

def __init__(
    self,
    vocab_size=50280,
    hidden_size=768,
    state_size=16,
    num_hidden_layers=32,
    layer_norm_epsilon=1e-5,
    pad_token_id=0,
    bos_token_id=0,
    eos_token_id=0,
    expand=2,
    conv_kernel=4,
    use_bias=False,
    use_conv_bias=True,
    hidden_act="silu",
    initializer_range=0.1,
    residual_in_fp32=True,
    time_step_rank="auto",
    time_step_scale=1.0,
    time_step_min=0.001,
    time_step_max=0.1,
    time_step_init_scheme="random",
    time_step_floor=1e-4,
    rescale_prenorm_residual=False,
    use_cache=True,
    **kwargs,
):
    '''
    Store the Mamba hyperparameters on the instance and initialize the base configuration.

    Each argument is saved as an attribute of the same name. In addition, `intermediate_size` is derived
    as `int(expand * hidden_size)`, and a `time_step_rank` of `"auto"` is resolved to
    `math.ceil(hidden_size / 16)` before being stored.

    Args:
        vocab_size (int, optional): The size of the vocabulary. Defaults to 50280.
        hidden_size (int, optional): The size of the hidden state. Defaults to 768.
        state_size (int, optional): The size of the state. Defaults to 16.
        num_hidden_layers (int, optional): The number of hidden layers. Defaults to 32.
        layer_norm_epsilon (float, optional): The epsilon value for layer normalization. Defaults to 1e-05.
        pad_token_id (int, optional): The token ID for padding. Defaults to 0.
        bos_token_id (int, optional): The token ID for the beginning of sequence. Defaults to 0.
        eos_token_id (int, optional): The token ID for the end of sequence. Defaults to 0.
        expand (int, optional): Factor multiplied by `hidden_size` to obtain `intermediate_size`.
            Defaults to 2.
        conv_kernel (int, optional): The kernel size for convolution. Defaults to 4.
        use_bias (bool, optional): Whether to use bias. Defaults to False.
        use_conv_bias (bool, optional): Whether to use bias in convolution. Defaults to True.
        hidden_act (str, optional): The activation function for hidden layers. Defaults to 'silu'.
        initializer_range (float, optional): The range for weight initialization. Defaults to 0.1.
        residual_in_fp32 (bool, optional): Whether to keep residual in FP32. Defaults to True.
        time_step_rank (str or int, optional): The rank to store, or 'auto' to derive it from
            `hidden_size`. Defaults to 'auto'.
        time_step_scale (float, optional): The scale factor for time step. Defaults to 1.0.
        time_step_min (float, optional): The minimum value for time step. Defaults to 0.001.
        time_step_max (float, optional): The maximum value for time step. Defaults to 0.1.
        time_step_init_scheme (str, optional): The initialization scheme for time step. Defaults to 'random'.
        time_step_floor (float, optional): The floor value for time step. Defaults to 0.0001.
        rescale_prenorm_residual (bool, optional): Whether to rescale pre-norm residuals. Defaults to False.
        use_cache (bool, optional): Whether to use cache. Defaults to True.
        **kwargs: Additional keyword arguments forwarded unchanged to the base-class initializer.

    Returns:
        None.
    '''
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.state_size = state_size
    self.num_hidden_layers = num_hidden_layers
    self.layer_norm_epsilon = layer_norm_epsilon
    self.conv_kernel = conv_kernel
    self.expand = expand
    # Derived value: not a constructor argument, always recomputed from expand and hidden_size.
    self.intermediate_size = int(expand * self.hidden_size)
    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.pad_token_id = pad_token_id
    self.use_bias = use_bias
    self.use_conv_bias = use_conv_bias
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    # "auto" is replaced by a concrete rank; any other value is stored as given.
    self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
    self.time_step_scale = time_step_scale
    self.time_step_min = time_step_min
    self.time_step_max = time_step_max
    self.time_step_init_scheme = time_step_init_scheme
    self.time_step_floor = time_step_floor
    self.rescale_prenorm_residual = rescale_prenorm_residual
    self.residual_in_fp32 = residual_in_fp32
    self.use_cache = use_cache

    # The special-token ids are also handed to the base class together with any extra kwargs.
    super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)

`mindnlp.transformers.models.mamba.modeling_graph_mamba` ¶

MindSpore MAMBA model.

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock` ¶

Bases: Module

The MSMambaBlock class represents a block for the MSMamba model. It inherits from the nn.Module class and is designed to handle the configuration and processing of hidden states for the MSMamba model.

ATTRIBUTE	DESCRIPTION
`config`	An object containing configuration settings for the block.
`layer_idx`	An integer representing the index of the layer.
`residual_in_fp32`	A boolean indicating whether residual values are in 32-bit floating point format.
`norm`	An instance of the MSMambaRMSNorm class for performing layer normalization.
`mixer`	An instance of the MSMambaMixer class for mixing hidden states based on the configuration and layer index.

METHOD	DESCRIPTION
`forward`	Processes the input hidden states using the configured normalization and mixing operations, and returns the processed hidden states.

Note

This class is part of the MSMamba model and is specifically designed for handling the processing of hidden states within the model architecture.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MSMambaBlock(nn.Module):
    """
    One residual block of the MSMamba model: a normalization layer followed by a mixer, wrapped in a
    skip connection.

    Attributes:
        config: The configuration object the block was built from.
        layer_idx: The index of this layer, as passed to the constructor.
        residual_in_fp32: Copied from `config.residual_in_fp32`; when true, the skip branch is cast to
            `mindspore.float32` before it is added back.
        norm: An `MSMambaRMSNorm` built with `config.hidden_size` and `eps=config.layer_norm_epsilon`.
        mixer: An `MSMambaMixer` built from the configuration and the layer index.

    Methods:
        forward: Normalizes the input, runs the mixer on it and adds the result to the (optionally
            float32) input.
    """

    def __init__(self, config, layer_idx):
        """
        Build the normalization layer and the mixer for this block.

        Args:
            config (object): The configuration object; `residual_in_fp32`, `hidden_size` and
                `layer_norm_epsilon` are read from it here.
            layer_idx (int): The index of the layer in the model; forwarded to the mixer.

        Returns:
            None.
        """
        super().__init__()
        self.config, self.layer_idx = config, layer_idx
        self.residual_in_fp32 = config.residual_in_fp32

        norm_eps = config.layer_norm_epsilon
        self.norm = MSMambaRMSNorm(config.hidden_size, eps=norm_eps)
        self.mixer = MSMambaMixer(config, layer_idx=layer_idx)

    def forward(self, hidden_states, cache_params=None):
        """
        Apply normalization, the mixer and the residual connection.

        Args:
            hidden_states (Tensor): The input hidden states to the block.
            cache_params: Forwarded unchanged to the mixer (default: None).
                NOTE(review): presumably an `MSMambaCache` instance - confirm against the mixer.

        Returns:
            Tensor: The input (cast to `mindspore.float32` when `residual_in_fp32` is set) plus the
            mixer output.
        """
        # The norm runs in the dtype of its own weight, so the input is cast to match first.
        normed = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
        skip = hidden_states.to(mindspore.float32) if self.residual_in_fp32 else hidden_states
        mixed = self.mixer(normed, cache_params=cache_params)
        return skip + mixed

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock.__init__(config, layer_idx)` ¶

Initializes a new instance of the MSMambaBlock class.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaBlock class. TYPE: `MSMambaBlock`
`config`	The configuration object containing various settings. TYPE: `object`
`layer_idx`	The index of the layer in the model. TYPE: `int`

RETURNS	DESCRIPTION
	None.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __init__(self, config, layer_idx):
    """
    Build the normalization layer and the mixer for this block.

    Args:
        config (object): The configuration object; `residual_in_fp32`, `hidden_size` and
            `layer_norm_epsilon` are read from it here.
        layer_idx (int): The index of the layer in the model; forwarded to the mixer.

    Returns:
        None.
    """
    super().__init__()
    self.config, self.layer_idx = config, layer_idx
    self.residual_in_fp32 = config.residual_in_fp32

    norm_eps = config.layer_norm_epsilon
    self.norm = MSMambaRMSNorm(config.hidden_size, eps=norm_eps)
    self.mixer = MSMambaMixer(config, layer_idx=layer_idx)

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock.forward(hidden_states, cache_params=None)` ¶

Applies normalization, the mixer, and the residual connection to the input hidden states.

PARAMETER	DESCRIPTION
`self`	An instance of the MSMambaBlock class. TYPE: `MSMambaBlock`
`hidden_states`	The input hidden states to the block. TYPE: `Tensor`
`cache_params`	A dictionary containing cache parameters (default: None). TYPE: `Optional[Dict]` DEFAULT: `None`

RETURNS	DESCRIPTION
	The processed hidden states: the residual input plus the mixer output.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def forward(self, hidden_states, cache_params=None):
    """
    Apply normalization, the mixer and the residual connection.

    Args:
        hidden_states (Tensor): The input hidden states to the block.
        cache_params: Forwarded unchanged to the mixer (default: None).
            NOTE(review): presumably an `MSMambaCache` instance - confirm against the mixer.

    Returns:
        Tensor: The input (cast to `mindspore.float32` when `residual_in_fp32` is set) plus the
        mixer output.
    """
    # The norm runs in the dtype of its own weight, so the input is cast to match first.
    normed = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
    skip = hidden_states.to(mindspore.float32) if self.residual_in_fp32 else hidden_states
    mixed = self.mixer(normed, cache_params=cache_params)
    return skip + mixed

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaCache` ¶

The MSMambaCache class represents a cache for storing intermediate states and parameters used in the MSMamba algorithm. It is designed to be used in conjunction with the MSMambaModel class.

This class provides functionality for initializing the cache and storing intermediate states and parameters. The cache is used to store the convolutional states (conv_states) and the state-space model states (ssm_states) for each hidden layer in the MSMamba algorithm. The cache is initialized with zero tensors of appropriate shapes.

ATTRIBUTE	DESCRIPTION
`seqlen_offset`	A parameter representing the sequence length offset.
`dtype`	The data type of the cache tensors (default: mindspore.float16).
`conv_states`	A parameter storing the convolutional states for each hidden layer. It is a tensor of shape (num_hidden_layers, batch_size, intermediate_size, conv_kernel_size).
`ssm_states`	A parameter storing the state-space model states for each hidden layer. It is a tensor of shape (num_hidden_layers, batch_size, intermediate_size, ssm_state_size).

Note

This class does not declare a base class; it is decorated with `mindspore.jit_class`.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

@mindspore.jit_class
class MSMambaCache:
    """
    Container for the per-layer states that the MSMamba model carries between calls.

    Two zero-initialized buffers are allocated, each with the layer index as the leading axis:
    the convolutional states (`conv_states`) and the state-space model states (`ssm_states`).

    Attributes:
        `seqlen_offset`: A `Parameter` initialized to 0.
        `dtype`: The data type the buffers were created with (default: mindspore.float16).
        `conv_states`: A `Parameter` of zeros with shape
            (num_hidden_layers, batch_size, intermediate_size, conv_kernel).
        `ssm_states`: A `Parameter` of zeros with shape
            (num_hidden_layers, batch_size, intermediate_size, state_size).

    Note:
        The class declares no base class; it is decorated with `mindspore.jit_class`.
    """

    def __init__(self, config, batch_size, dtype=mindspore.float16):
        """
        Allocate the zero-filled state buffers.

        Args:
            config (object): The configuration object; `num_hidden_layers`, `intermediate_size`,
                `conv_kernel` and `state_size` are read from it.
            batch_size (int): The batch dimension of both buffers.
            dtype (object, optional): The data type of both buffers, defaults to mindspore.float16.

        Returns:
            None.
        """
        self.seqlen_offset = Parameter(0)
        self.dtype = dtype

        layer_count = config.num_hidden_layers
        inner_size = config.intermediate_size

        # All layers share one stacked tensor (layer index first) rather than a dict with one
        # tensor per layer.
        conv_zeros = ops.zeros(layer_count, batch_size, inner_size, config.conv_kernel, dtype=dtype)
        self.conv_states = Parameter(conv_zeros, name='conv_states')

        ssm_zeros = ops.zeros(layer_count, batch_size, inner_size, config.state_size, dtype=dtype)
        self.ssm_states = Parameter(ssm_zeros, name='ssm_states')

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaCache.__init__(config, batch_size, dtype=mindspore.float16)` ¶

This method initializes an instance of the MSMambaCache class.

PARAMETER	DESCRIPTION
`self`	The instance of the class. TYPE: `object`
`config`	The configuration object containing parameters for the cache. TYPE: `object`
`batch_size`	The size of the batch for processing. TYPE: `int`
`dtype`	The data type for the cache, defaults to mindspore.float16. TYPE: `object` DEFAULT: `float16`

RETURNS	DESCRIPTION
	None.

RAISES	DESCRIPTION
`ValueError`	If the batch_size is not a positive integer.
`TypeError`	If the dtype is not a valid data type.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __init__(self, config, batch_size, dtype=mindspore.float16):
    """
    Allocate the zero-filled state buffers.

    Args:
        config (object): The configuration object; `num_hidden_layers`, `intermediate_size`,
            `conv_kernel` and `state_size` are read from it.
        batch_size (int): The batch dimension of both buffers.
        dtype (object, optional): The data type of both buffers, defaults to mindspore.float16.

    Returns:
        None.
    """
    self.seqlen_offset = Parameter(0)
    self.dtype = dtype

    layer_count = config.num_hidden_layers
    inner_size = config.intermediate_size

    # All layers share one stacked tensor (layer index first) rather than a dict with one
    # tensor per layer.
    conv_zeros = ops.zeros(layer_count, batch_size, inner_size, config.conv_kernel, dtype=dtype)
    self.conv_states = Parameter(conv_zeros, name='conv_states')

    ssm_zeros = ops.zeros(layer_count, batch_size, inner_size, config.state_size, dtype=dtype)
    self.ssm_states = Parameter(ssm_zeros, name='ssm_states')

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM` ¶

Bases: MSMambaPreTrainedModel

MSMambaForCausalLM is a class that represents a Mamba model for Causal Language Modeling. It inherits from MSMambaPreTrainedModel and includes methods for setting and getting input and output embeddings, as well as preparing inputs for generation and forwarding the model for training and evaluation.

The class includes the following methods:

`__init__`: Initializes the model with a given configuration.
get_output_embeddings: Retrieves the output embeddings of the model.
set_output_embeddings: Sets new output embeddings for the model.
get_input_embeddings: Retrieves the input embeddings of the model.
set_input_embeddings: Sets new input embeddings for the model.
_update_model_kwargs_for_generation: Updates model keyword arguments for generation.
prepare_inputs_for_generation: Prepares inputs for generation based on the given parameters.
forward: Constructs the model for training and evaluation, including handling labels for language modeling and computing loss.

When utilizing the MSMambaForCausalLM class, users can easily manage input and output embeddings, prepare inputs for generating text, and forward the model for training and evaluation purposes.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MSMambaForCausalLM(MSMambaPreTrainedModel):

    """
    Mamba model with a language-modeling head for causal language modeling.

    The model is an `MSMambaModel` backbone followed by a bias-free `MambaDense` projection (`lm_head`)
    from `config.hidden_size` to `config.vocab_size`.

    Public surface:

    - get_output_embeddings / set_output_embeddings: read or replace `lm_head`.
    - get_input_embeddings / set_input_embeddings: delegate to the backbone.
    - _update_model_kwargs_for_generation: carries `cache_params` from one generation step to the next.
    - prepare_inputs_for_generation: builds the keyword arguments for the next `forward` call.
    - forward: runs the backbone, computes logits and, when labels are given, the shifted
      cross-entropy loss.
    """
    # Names of weights declared as tied.
    # NOTE(review): presumably consumed by the base class's weight-tying logic - confirm there.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):

        """
        Build the backbone and the language-modeling head.

        Args:
            config: Configuration object. It is handed to the parent constructor and to `MSMambaModel`;
                `config.hidden_size` and `config.vocab_size` give the input/output sizes of `lm_head`.
        """
        super().__init__(config)
        self.backbone = MSMambaModel(config)
        self.lm_head = MambaDense(config.hidden_size, config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):

        """
        Return the output projection of the model.

        Returns:
            The `lm_head` module that maps hidden states to vocabulary logits.
        """
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):

        """
        Replace the output projection of the model.

        Args:
            new_embeddings: Object stored as `self.lm_head`. `forward` calls it on the hidden states and
                reads its `weight.dtype`, so it must be a callable module exposing a `weight`.

        Returns:
            None.
        """
        self.lm_head = new_embeddings

    def get_input_embeddings(self):

        """
        Return the input embeddings of the model.

        Returns:
            Whatever `self.backbone.get_input_embeddings()` returns.
        """
        return self.backbone.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):

        """
        Replace the input embeddings of the model.

        The call is delegated to the backbone; no validation is performed here.

        Args:
            new_embeddings: Forwarded unchanged to `self.backbone.set_input_embeddings`.

        Returns:
            The return value of `self.backbone.set_input_embeddings(new_embeddings)`.
        """
        return self.backbone.set_input_embeddings(new_embeddings)

    def _update_model_kwargs_for_generation(
        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], **kwargs
    ) -> Dict[str, Any]:

        """
        Carry the cache from the last forward pass into the next generation step.

        Args:
            outputs (ModelOutput): Result of the previous forward call; its `cache_params` entry is read
                with `.get`, so a missing entry yields `None`.
            model_kwargs (Dict[str, Any]): Keyword arguments for the next step. Mutated in place.
            **kwargs: Accepted and ignored.

        Returns:
            Dict[str, Any]: The same `model_kwargs` object with `"cache_params"` overwritten.
        """
        model_kwargs["cache_params"] = outputs.get("cache_params", None)
        return model_kwargs

    def prepare_inputs_for_generation(
        self, input_ids, cache_params=None, inputs_embeds=None, **kwargs
    ):

        """
        Build the keyword arguments for the next forward call during generation.

        Args:
            input_ids: Token ids indexed as `[:, -1]`, i.e. at least two-dimensional with the sequence on
                the last axis.
            cache_params: Cached state from a previous step, or `None` on the first step.
            inputs_embeds: Optional pre-computed embeddings. Only used when no cache is supplied.
            **kwargs: Accepted and ignored.

        Returns:
            dict: `{"inputs_embeds": ...}` when embeddings are given and there is no cache, otherwise
                `{"input_ids": ...}`; `"cache_params"` is always added.
        """
        # only last token for inputs_ids if the state is passed along.
        if cache_params is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        if inputs_embeds is not None and cache_params is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs["cache_params"] = cache_params
        return model_inputs

    @mindspore.jit
    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        cache_params: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,  # for now we need this for generation
    ) -> Union[Tuple, Dict]:
        r"""
        Run the backbone, project to vocabulary logits and optionally compute the language-modeling loss.

        Args:
            input_ids (`mindspore.Tensor`, *optional*):
                Passed positionally to the backbone.
            inputs_embeds (`mindspore.Tensor`, *optional*):
                Forwarded to the backbone as `inputs_embeds`.
            cache_params (*optional*):
                Cached state forwarded unchanged to the backbone.
                NOTE(review): annotated as a tensor; confirm whether a cache object is expected instead.
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
                `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
                are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
            output_hidden_states (`bool`, *optional*):
                Forwarded to the backbone.
            return_dict (`bool`, *optional*):
                Falls back to `self.config.use_return_dict` when `None`.
            **kwargs:
                Accepted and ignored.

        Returns:
            When `return_dict` is falsy, the tuple `(logits,) + mamba_outputs[1:]`, prefixed with the loss
            when `labels` is given. Otherwise a dict with the keys `loss`, `logits`, `cache_params` and
            `hidden_states`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        mamba_outputs = self.backbone(
            input_ids,
            cache_params=cache_params,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        if return_dict:
            hidden_states = mamba_outputs['last_hidden_state']
        else:
            hidden_states = mamba_outputs[0]

        # Run the head in its own weight dtype, then return the logits as float.
        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + mamba_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return {
            'loss': loss,
            'logits': logits,
            'cache_params': mamba_outputs['cache_params'],
            'hidden_states': mamba_outputs['hidden_states'],
        }

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.__init__(config)` ¶

Initializes an instance of MSMambaForCausalLM.

PARAMETER	DESCRIPTION
`self`	The instance of the class. TYPE: `object`
`config`	An object containing configuration parameters. TYPE: `object`

RETURNS	DESCRIPTION
	None.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __init__(self, config):

    """
    Build the backbone and the language-modeling head of MSMambaForCausalLM.

    Args:
        config: Configuration object. It is handed to the parent constructor and to `MSMambaModel`;
            `config.hidden_size` and `config.vocab_size` give the input/output sizes of `lm_head`.
    """
    super().__init__(config)
    self.backbone = MSMambaModel(config)
    # Bias-free projection from hidden states to vocabulary logits.
    self.lm_head = MambaDense(config.hidden_size, config.vocab_size, bias=False)
    # Initialize weights and apply final processing
    self.post_init()

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.forward(input_ids=None, inputs_embeds=None, cache_params=None, labels=None, output_hidden_states=None, return_dict=None, **kwargs)` ¶

PARAMETER	DESCRIPTION
`labels`	Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`. TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, optional DEFAULT: `None`

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

@mindspore.jit
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    cache_params: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    **kwargs,  # for now we need this for generation
) -> Union[Tuple, Dict]:
    r"""
    Run the backbone, project to vocabulary logits and optionally compute the language-modeling loss.

    Args:
        input_ids (`mindspore.Tensor`, *optional*):
            Passed positionally to the backbone.
        inputs_embeds (`mindspore.Tensor`, *optional*):
            Forwarded to the backbone as `inputs_embeds`.
        cache_params (*optional*):
            Cached state forwarded unchanged to the backbone.
            NOTE(review): annotated as a tensor; confirm whether a cache object is expected instead.
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        output_hidden_states (`bool`, *optional*):
            Forwarded to the backbone.
        return_dict (`bool`, *optional*):
            Falls back to `self.config.use_return_dict` when `None`.
        **kwargs:
            Accepted and ignored.

    Returns:
        When `return_dict` is falsy, the tuple `(logits,) + mamba_outputs[1:]`, prefixed with the loss
        when `labels` is given. Otherwise a dict with the keys `loss`, `logits`, `cache_params` and
        `hidden_states`.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    mamba_outputs = self.backbone(
        input_ids,
        cache_params=cache_params,
        inputs_embeds=inputs_embeds,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    if return_dict:
        hidden_states = mamba_outputs['last_hidden_state']
    else:
        hidden_states = mamba_outputs[0]

    # Run the head in its own weight dtype, then return the logits as float.
    logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        loss = F.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

    if not return_dict:
        output = (logits,) + mamba_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return {
        'loss': loss,
        'logits': logits,
        'cache_params': mamba_outputs['cache_params'],
        'hidden_states': mamba_outputs['hidden_states'],
    }

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.get_input_embeddings()` ¶

Retrieve the input embeddings from the MSMambaForCausalLM model.

PARAMETER	DESCRIPTION
`self`	An instance of the MSMambaForCausalLM class. TYPE: `MSMambaForCausalLM`

RETURNS	DESCRIPTION
	The input embeddings returned by `self.backbone.get_input_embeddings()`.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def get_input_embeddings(self):
    """
    Return the input embeddings of the MSMambaForCausalLM model.

    The lookup is delegated to the backbone.

    Returns:
        Whatever `self.backbone.get_input_embeddings()` returns.
    """
    backbone = self.backbone
    return backbone.get_input_embeddings()

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.get_output_embeddings()` ¶

Method to retrieve the output embeddings from the MSMambaForCausalLM model.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaForCausalLM class.

RETURNS	DESCRIPTION
	The language-modeling head (`self.lm_head`).

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def get_output_embeddings(self):
    """
    Return the output projection of the MSMambaForCausalLM model.

    Returns:
        The object stored in `self.lm_head`.
    """
    head = self.lm_head
    return head

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.prepare_inputs_for_generation(input_ids, cache_params=None, inputs_embeds=None, **kwargs)` ¶

Prepare inputs for generation.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaForCausalLM class. TYPE: `MSMambaForCausalLM`
`input_ids`	The input tensor containing tokenized input sequence. TYPE: `Tensor`
`cache_params`	Parameters for caching intermediate computations. TYPE: `dict` DEFAULT: `None`
`inputs_embeds`	The embedded input tensor. TYPE: `Tensor` DEFAULT: `None`

RETURNS	DESCRIPTION
`dict`	The model inputs containing either 'inputs_embeds' or 'input_ids' based on the availability of 'inputs_embeds' and 'cache_params'.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def prepare_inputs_for_generation(
    self, input_ids, cache_params=None, inputs_embeds=None, **kwargs
):
    """
    Build the keyword arguments for the next forward call during generation.

    Args:
        input_ids: Token ids; indexed as `[:, -1]` when a cache is supplied.
        cache_params: Cached state from a previous step, or `None` on the first step.
        inputs_embeds: Optional pre-computed embeddings. Only used when no cache is supplied.
        **kwargs: Accepted and ignored.

    Returns:
        dict: `{"inputs_embeds": ...}` when embeddings are given and there is no cache, otherwise
            `{"input_ids": ...}`; `"cache_params"` is always added as the second entry.
    """
    has_cache = cache_params is not None

    if has_cache:
        # The cached state already summarizes the prefix, so feed only the newest token.
        last_token = input_ids[:, -1]
        input_ids = last_token.unsqueeze(-1)

    # Embeddings are honored only on the very first step (no cache yet).
    use_embeds = (inputs_embeds is not None) and not has_cache
    model_inputs = {"inputs_embeds": inputs_embeds} if use_embeds else {"input_ids": input_ids}

    model_inputs["cache_params"] = cache_params
    return model_inputs

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.set_input_embeddings(new_embeddings)` ¶

Sets the input embeddings for the MSMambaForCausalLM model.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaForCausalLM class. TYPE: `MSMambaForCausalLM`
`new_embeddings`	The new input embeddings to be set for the model. Should be a tensor of shape (vocab_size, embedding_dim). TYPE: `Tensor`

RETURNS	DESCRIPTION
`None`	The method sets the input embeddings for the model and does not return any value.

RAISES	DESCRIPTION
`ValueError`	If the new_embeddings tensor does not have the correct shape (vocab_size, embedding_dim).
`TypeError`	If the new_embeddings parameter is not a tensor.
`RuntimeError`	If the operation to set the input embeddings fails for any reason.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def set_input_embeddings(self, new_embeddings):
    """
    Replace the input embeddings of the MSMambaForCausalLM model.

    The call is delegated to the backbone; no validation is performed here.

    Args:
        new_embeddings: Forwarded unchanged to `self.backbone.set_input_embeddings`.

    Returns:
        The return value of `self.backbone.set_input_embeddings(new_embeddings)`.
    """
    backbone = self.backbone
    result = backbone.set_input_embeddings(new_embeddings)
    return result

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.set_output_embeddings(new_embeddings)` ¶

Sets the output embeddings of the MSMambaForCausalLM model.

PARAMETER	DESCRIPTION
`self`	The MSMambaForCausalLM object. TYPE: `MSMambaForCausalLM`
`new_embeddings`	The new embeddings to be set as the output embeddings. TYPE: `Tensor`

RETURNS	DESCRIPTION
	None.

This method allows for setting the output embeddings of the MSMambaForCausalLM model. The output embeddings are used in the generation of predictions by the language model head. By setting new embeddings, you can modify the characteristics of the generated predictions.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def set_output_embeddings(self, new_embeddings):

    """
    Replace the output projection of the MSMambaForCausalLM model.

    Args:
        new_embeddings: Object stored as `self.lm_head`; it becomes what
            `get_output_embeddings` returns.

    Returns:
        None.
    """
    self.lm_head = new_embeddings

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer` ¶

Bases: Module

Compute ∆, A, B, C, and D the state space parameters and compute the contextualized_states. A, D are input independent (see MSMamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) ∆, B, C are input-dependent (this is a key difference between MSMamba and the linear time invariant S4, and is why MSMamba is called selective state spaces)

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MSMambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see MSMamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between MSMamba and the linear time invariant S4,
    and is why MSMamba is called **selective** state spaces)
    """
    def __init__(self, config, layer_idx):
        """
        Create the projections, the depthwise convolution and the state-space parameters of one mixer layer.

        Args:
            config:
                Configuration object. The following attributes are read:

                - hidden_size (int): Input/output width of the mixer.
                - state_size (int): Size of the SSM state (`ssm_state_size`).
                - conv_kernel (int): Kernel size of the 1-D convolution.
                - intermediate_size (int): Channel count inside the mixer.
                - time_step_rank (int): Input width of the time-step projection `dt_proj`.
                - use_conv_bias (bool): Whether the convolution has a bias.
                - hidden_act (str): Key into `ACT2FN` selecting the activation.
                - use_bias (bool): Whether `in_proj` and `out_proj` have a bias.
            layer_idx: Index of this layer; used in `forward` to address the per-layer cache entries.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.state_size
        self.conv_kernel_size = config.conv_kernel
        self.intermediate_size = config.intermediate_size
        self.time_step_rank = config.time_step_rank
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias
        # Depthwise convolution (groups == channels); forward crops its output back to seq_len.
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=config.use_conv_bias,
            kernel_size=config.conv_kernel,
            groups=self.intermediate_size,
            padding=config.conv_kernel - 1,
        )

        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]

        # projection of the input hidden states
        self.in_proj = MambaDense(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
        # selective projection used to make dt, B and C input dependent
        self.x_proj = MambaDense(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
        # time step projection (discretization)
        self.dt_proj = MambaDense(self.time_step_rank, self.intermediate_size, bias=True)

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = ops.arange(1, self.ssm_state_size + 1, dtype=mindspore.float32)[None, :]
        A = A.expand(self.intermediate_size, -1)

        # A is stored as log(A); forward recovers the negative-valued A via -exp(A_log).
        self.A_log = Parameter(ops.log(A))
        self.D = Parameter(ops.ones(self.intermediate_size))
        self.out_proj = MambaDense(self.intermediate_size, self.hidden_size, bias=config.use_bias)
        self.use_bias = config.use_bias

    # fmt: off
    def forward(self, input_states, cache_params=None):

        """
        Compute the contextualized states for a batch of sequences.

        Args:
            input_states: Tensor whose shape unpacks as (batch_size, seq_len, _).
            cache_params: Optional cache object exposing `ssm_states`, `conv_states` and `seqlen_offset`.
                When given, the entries at `self.layer_idx` are read and overwritten. A positive
                `seqlen_offset` selects the single-token decoding path. Defaults to None.

        Returns:
            The output of `out_proj`, i.e. the contextualized states of shape [batch, seq_len, hidden_size].
        """
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype
        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(input_states).swapaxes(1, 2)                   # [batch, 2 * intermediate_size, seq_len]
        hidden_states, gate = projected_states.chunk(2, axis=1)

        # 2. Convolution sequence transformation
        if cache_params is not None:
            ssm_state = cache_params.ssm_states[self.layer_idx]
            if cache_params.seqlen_offset > 0:
                # Decoding: slide the cached window left by one and put the new token in the last slot.
                conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
                conv_state = ops.roll(conv_state, shifts=-1, dims=-1)
                conv_state[:, :, -1] = hidden_states[:, :, 0]
                cache_params.conv_states[self.layer_idx] = conv_state
                # Convolution at a single position written out as a weighted sum over the window.
                hidden_states = ops.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)         # [batch, intermediate_size, 1] : decoding
            else:
                # Prefill: left-pad the inputs to conv_kernel_size columns and store them as the window.
                # NOTE(review): the pad amount is negative when seq_len > conv_kernel_size; presumably
                # ops.pad then crops to the last conv_kernel_size columns - confirm.
                conv_state = ops.pad(
                    hidden_states,
                    (self.conv_kernel_size - hidden_states.shape[-1], 0)
                )
                cache_params.conv_states[self.layer_idx] = conv_state
                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
        else:
            # No cache: start the recurrence from an all-zero state.
            ssm_state = ops.zeros(
                (batch_size, self.intermediate_size, self.ssm_state_size), dtype=dtype
            )
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])         # [batch, intermediate_size, seq_len]

        # 3. State Space Model sequence transformation
        # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
        ssm_parameters = self.x_proj(hidden_states.swapaxes(1, 2))
        time_step, B, C = ops.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
        )
        discrete_time_step = self.dt_proj(time_step)                                    # [batch, seq_len, intermediate_size]
        discrete_time_step = ops.softplus(discrete_time_step).swapaxes(1, 2) # [batch, intermediate_size, seq_len]

        # 3.b. Discretization: A and B to [batch, intermediate_size, seq_len, ssm_state_size]
        A = -ops.exp(self.A_log.float())                                              # [intermediate_size, ssm_state_size]
        discrete_A = ops.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size]
        discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float()       # [batch, intermediate_size, seq_len, ssm_state_size]
        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()

        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
        scan_outputs = []
        for i in range(seq_len):
            ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]      # [batch, intermediate_size, ssm_state]
            scan_output = ops.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
            scan_outputs.append(scan_output[:, :, 0])
        scan_output = ops.stack(scan_outputs, dim=-1)                                # [batch, intermediate_size, seq_len]
        # Skip connection scaled per channel by D, then gating.
        scan_output = scan_output + (hidden_states * self.D[None, :, None])
        scan_output = scan_output * self.act(gate)

        if cache_params is not None:
            # Persist the final recurrent state for the next call.
            cache_params.ssm_states[self.layer_idx] = ssm_state

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_output.swapaxes(1, 2))             # [batch, seq_len, hidden_size]
        return contextualized_states

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer.__init__(config, layer_idx)` ¶

Initializes an instance of the MSMambaMixer class.

PARAMETER	DESCRIPTION
`self`	The instance of the class.
`config`	An object containing configuration parameters for the mixer. hidden_size (int): Size of the hidden layer. state_size (int): Size of the state. conv_kernel (int): Kernel size for convolution. intermediate_size (int): Size of the intermediate layer. time_step_rank (int): Rank of the time step. use_conv_bias (bool): Indicates whether to use bias in convolution. hidden_act (str): Activation function for the hidden layer. use_bias (bool): Indicates whether to use bias in the dense layers.
`layer_idx`	Index of the current layer.

RETURNS	DESCRIPTION
	None

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __init__(self, config, layer_idx):
    """
    Create the projections, the depthwise convolution and the state-space parameters of one mixer layer.

    Args:
        config:
            Configuration object. The following attributes are read:

            - hidden_size (int): Input/output width of the mixer.
            - state_size (int): Size of the SSM state (`ssm_state_size`).
            - conv_kernel (int): Kernel size of the 1-D convolution.
            - intermediate_size (int): Channel count inside the mixer.
            - time_step_rank (int): Input width of the time-step projection `dt_proj`.
            - use_conv_bias (bool): Whether the convolution has a bias.
            - hidden_act (str): Key into `ACT2FN` selecting the activation.
            - use_bias (bool): Whether `in_proj` and `out_proj` have a bias.
        layer_idx: Index of this layer; stored as `self.layer_idx`.
    """
    super().__init__()
    self.hidden_size = config.hidden_size
    self.ssm_state_size = config.state_size
    self.conv_kernel_size = config.conv_kernel
    self.intermediate_size = config.intermediate_size
    self.time_step_rank = config.time_step_rank
    self.layer_idx = layer_idx
    self.use_conv_bias = config.use_conv_bias
    # Depthwise convolution: groups equals the channel count, padded by kernel_size - 1.
    self.conv1d = nn.Conv1d(
        in_channels=self.intermediate_size,
        out_channels=self.intermediate_size,
        bias=config.use_conv_bias,
        kernel_size=config.conv_kernel,
        groups=self.intermediate_size,
        padding=config.conv_kernel - 1,
    )

    self.activation = config.hidden_act
    self.act = ACT2FN[config.hidden_act]

    # projection of the input hidden states
    self.in_proj = MambaDense(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
    # selective projection used to make dt, B and C input dependent
    self.x_proj = MambaDense(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
    # time step projection (discretization)
    self.dt_proj = MambaDense(self.time_step_rank, self.intermediate_size, bias=True)

    # S4D real initialization. These are not discretized!
    # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
    A = ops.arange(1, self.ssm_state_size + 1, dtype=mindspore.float32)[None, :]
    A = A.expand(self.intermediate_size, -1)

    # Stored in log space: each row holds log(1), ..., log(ssm_state_size).
    self.A_log = Parameter(ops.log(A))
    self.D = Parameter(ops.ones(self.intermediate_size))
    self.out_proj = MambaDense(self.intermediate_size, self.hidden_size, bias=config.use_bias)
    self.use_bias = config.use_bias

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer.forward(input_states, cache_params=None)` ¶

Constructs contextualized states using the MSMambaMixer algorithm.

PARAMETER	DESCRIPTION
`self`	An instance of the MSMambaMixer class. TYPE: `MSMambaMixer`
`input_states`	The input states of shape (batch_size, seq_len, _). TYPE: `ndarray`
`cache_params`	The cache parameters. Defaults to None. TYPE: `Optional[CacheParams]` DEFAULT: `None`

RETURNS	DESCRIPTION
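	The contextualized states produced by the final `out_proj` projection, of shape `[batch, seq_len, hidden_size]`.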

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def forward(self, input_states, cache_params=None):

    """
    Run the mixer: gated input projection, causal depthwise convolution, selective SSM scan and
    output projection.

    Args:
        self (MSMambaMixer): An instance of the MSMambaMixer class.
        input_states (Tensor): The input states of shape (batch_size, seq_len, _).
        cache_params (Optional): Cache object exposing `ssm_states` and `conv_states` (both indexed by
            `self.layer_idx`) and `seqlen_offset`. When given, this layer's conv and ssm states are read
            from it and written back to it. Defaults to None.

    Returns:
        Tensor: The result of `out_proj` applied to the gated scan output
            ([batch, seq_len, hidden_size] per the shape comments below).

    Raises:
        None
    """
    batch_size, seq_len, _ = input_states.shape
    dtype = input_states.dtype
    # 1. Gated MLP's linear projection
    projected_states = self.in_proj(input_states).swapaxes(1, 2)                   # [batch, 2 * intermediate_size, seq_len]
    hidden_states, gate = projected_states.chunk(2, axis=1)

    # 2. Convolution sequence transformation
    if cache_params is not None:
        ssm_state = cache_params.ssm_states[self.layer_idx]
        if cache_params.seqlen_offset > 0:
            # Decoding step: slide the cached convolution window by one position and append the new input.
            # NOTE(review): only time step 0 of hidden_states is used here, so this path presumably
            # expects seq_len == 1 -- confirm against the caller.
            conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
            conv_state = ops.roll(conv_state, shifts=-1, dims=-1)
            conv_state[:, :, -1] = hidden_states[:, :, 0]
            cache_params.conv_states[self.layer_idx] = conv_state
            # Depthwise convolution over the cached window, computed as a weighted sum.
            hidden_states = ops.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
            if self.use_conv_bias:
                hidden_states += self.conv1d.bias
            hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)         # [batch, intermediate_size, 1] : decoding
        else:
            # First call with a cache: store the left-padded inputs as the convolution state,
            # then run the full convolution over the whole sequence.
            conv_state = ops.pad(
                hidden_states,
                (self.conv_kernel_size - hidden_states.shape[-1], 0)
            )
            cache_params.conv_states[self.layer_idx] = conv_state
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
    else:
        # No cache: start the recurrence from an all-zero SSM state.
        ssm_state = ops.zeros(
            (batch_size, self.intermediate_size, self.ssm_state_size), dtype=dtype
        )
        hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])         # [batch, intermediate_size, seq_len]

    # 3. State Space Model sequence transformation
    # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
    ssm_parameters = self.x_proj(hidden_states.swapaxes(1, 2))
    time_step, B, C = ops.split(
        ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
    )
    discrete_time_step = self.dt_proj(time_step)                                    # [batch, seq_len, intermediate_size]
    discrete_time_step = ops.softplus(discrete_time_step).swapaxes(1, 2) # [batch, intermediate_size, seq_len]

    # 3.b. Discretization: A and B to [batch, intermediate_size, seq_len, ssm_state_size]
    A = -ops.exp(self.A_log.float())                                              # [intermediate_size, ssm_state_size]
    discrete_A = ops.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size]
    discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float()       # [batch, intermediate_size, seq_len, ssm_state_size]
    deltaB_u = discrete_B * hidden_states[:, :, :, None].float()

    # 3.c perform the recurrence y ← SSM(A, B, C)(x), one time step at a time
    scan_outputs = []
    for i in range(seq_len):
        ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]      # [batch, intermediate_size, ssm_state]
        scan_output = ops.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
        scan_outputs.append(scan_output[:, :, 0])
    scan_output = ops.stack(scan_outputs, dim=-1)                                # [batch, intermediate_size, seq_len]
    # Skip connection weighted by D, then gating by the second half of the input projection.
    scan_output = scan_output + (hidden_states * self.D[None, :, None])
    scan_output = scan_output * self.act(gate)

    if cache_params is not None:
        # Persist the final recurrent state so the next call can continue from it.
        cache_params.ssm_states[self.layer_idx] = ssm_state

    # 4. Final linear projection
    contextualized_states = self.out_proj(scan_output.swapaxes(1, 2))             # [batch, seq_len, hidden_size]
    return contextualized_states

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel` ¶

Bases: MSMambaPreTrainedModel

MSMambaModel represents a model for MSMamba that inherits from MSMambaPreTrainedModel.

ATTRIBUTE	DESCRIPTION
`embeddings`	An embedding layer for the model's vocabulary. TYPE: `Embedding`
`layers`	A list of MSMambaBlock layers for the model. TYPE: `ModuleList`
`gradient_checkpointing`	Indicates if gradient checkpointing is enabled. TYPE: `bool`
`norm_f`	Normalization function for the model's hidden states. TYPE: `MSMambaRMSNorm`

METHOD	DESCRIPTION
`__init__`	Initializes the MSMambaModel with the given configuration.
`get_input_embeddings`	Retrieves the input embeddings for the model.
`set_input_embeddings`	Sets new input embeddings for the model.
`forward`	Constructs the model based on the input and configuration parameters.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MSMambaModel(MSMambaPreTrainedModel):

    """
    Bare MSMamba backbone: a token embedding, a stack of MSMambaBlock layers and a final RMS norm.

    Attributes:
        embeddings (nn.Embedding): Embedding table of shape (vocab_size, hidden_size).
        layers (nn.ModuleList): One MSMambaBlock per hidden layer, each built with its own `layer_idx`.
        gradient_checkpointing (bool): When set together with training mode, caching is disabled in `forward`.
        norm_f (MSMambaRMSNorm): Normalization applied to the output of the last block.
    """
    def __init__(self, config):

        """
        Build the embedding table, the block stack and the final norm from `config`.

        Args:
            self (object): The instance of MSMambaModel.
            config (object): Model configuration. The following attributes are read here:

                - vocab_size (int): The size of the vocabulary.
                - hidden_size (int): The size of the hidden layers.
                - num_hidden_layers (int): The number of MSMambaBlock layers to create.
                - layer_norm_epsilon (float): The epsilon passed to the final MSMambaRMSNorm.

        Returns:
            None.
        """
        super().__init__(config)

        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        blocks = [MSMambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(blocks)

        self.gradient_checkpointing = False
        self.norm_f = MSMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        # Weight initialization and any final processing are delegated to post_init().
        self.post_init()

    def get_input_embeddings(self):

        """
        Return the module currently stored as the input embedding layer.

        Args:
            self: The instance of the MSMambaModel class.

        Returns:
            The object held in `self.embeddings`.
        """
        embedding_layer = self.embeddings
        return embedding_layer

    def set_input_embeddings(self, new_embeddings):

        """
        Replace the input embedding layer.

        Args:
            self (MSMambaModel): The instance of the MSMambaModel class.
            new_embeddings (object): The object to store as `self.embeddings`.

        Returns:
            None.
        """
        self.embeddings = new_embeddings

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        cache_params: Optional[List[mindspore.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,  # `attention_mask` is passed by the tokenizer and we don't want it
    ) -> Union[Tuple, Dict]:

        '''
        Run the embedding, every block and the final norm.

        Args:
            self (MSMambaModel): The instance of the MSMambaModel.
            input_ids (Optional[mindspore.Tensor]): Token indices; only used (through `self.embeddings`)
                when `inputs_embeds` is None. Default is None.
            inputs_embeds (Optional[mindspore.Tensor]): Pre-computed embeddings; take precedence over
                `input_ids`. Default is None.
            cache_params: Cache handed to every block. A fresh MSMambaCache is created when this is None
                and caching is enabled. It is used as an object exposing `seqlen_offset`. Default is None.
            use_cache (Optional[bool]): Whether to use the cache. When None, falls back to
                `config.use_cache`, or to False in training mode. Forced to False when gradient
                checkpointing is active during training.
            output_hidden_states (Optional[bool]): Whether to collect the output of every block plus the
                normalized final output. When None, falls back to `config.output_hidden_states`.
            return_dict (Optional[bool]): Whether to return a dict. When None, falls back to
                `config.use_return_dict`.
            **kwargs: Accepted and ignored.

        Returns:
            Union[Tuple, Dict]:

                - If 'return_dict' is falsy, a tuple of the final hidden states, 'cache_params' and the
                collected hidden states, with every None entry dropped.
                - Otherwise a dictionary with keys 'last_hidden_state', 'cache_params' (None unless
                caching is enabled) and 'hidden_states'.

        Note:
            No explicit validation is performed here: `input_ids` and `inputs_embeds` are not checked for
            being mutually exclusive, nor for both being None.
        '''
        # Resolve every tri-state flag against the configuration.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = False if self.training else self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        # Caching is switched off while training with gradient checkpointing.
        if self.gradient_checkpointing and self.training and use_cache:
            use_cache = False

        if cache_params is None and use_cache:
            cache_params = MSMambaCache(
                self.config, inputs_embeds.shape[0], dtype=inputs_embeds.dtype
            )

        collected = () if output_hidden_states else None
        hidden_states = inputs_embeds
        for mixer_block in self.layers:
            hidden_states = mixer_block(hidden_states, cache_params=cache_params)
            if output_hidden_states:
                collected += (hidden_states,)

        if use_cache:
            # Advance the cache offset by the number of positions just processed.
            ops.assign(cache_params.seqlen_offset, cache_params.seqlen_offset + inputs_embeds.shape[1])

        hidden_states = self.norm_f(hidden_states)

        if output_hidden_states:
            collected += (hidden_states,)

        if return_dict:
            return {
                'last_hidden_state': hidden_states,
                'cache_params': cache_params if use_cache else None,
                'hidden_states': collected,
            }

        return tuple(item for item in (hidden_states, cache_params, collected) if item is not None)

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.__init__(config)` ¶

Initializes an instance of MSMambaModel.

PARAMETER	DESCRIPTION
`self`	The instance of MSMambaModel. TYPE: `object`
`config`	The configuration object containing parameters for the model. Must include the following attributes: vocab_size (int): The size of the vocabulary. hidden_size (int): The size of the hidden layers. num_hidden_layers (int): The number of hidden layers. layer_norm_epsilon (float): The epsilon value for layer normalization. TYPE: `object`

RETURNS	DESCRIPTION
	None.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __init__(self, config):

    """
    Build the embedding table, the block stack and the final norm from `config`.

    Args:
        self (object): The instance of MSMambaModel.
        config (object): Model configuration. The following attributes are read here:

            - vocab_size (int): The size of the vocabulary.
            - hidden_size (int): The size of the hidden layers.
            - num_hidden_layers (int): The number of MSMambaBlock layers to create.
            - layer_norm_epsilon (float): The epsilon passed to the final MSMambaRMSNorm.

    Returns:
        None.
    """
    super().__init__(config)

    self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    blocks = [MSMambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]
    self.layers = nn.ModuleList(blocks)

    self.gradient_checkpointing = False
    self.norm_f = MSMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
    # Weight initialization and any final processing are delegated to post_init().
    self.post_init()

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.forward(input_ids=None, inputs_embeds=None, cache_params=None, use_cache=None, output_hidden_states=None, return_dict=None, **kwargs)` ¶

Construct the MSMambaModel.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaModel. TYPE: `MSMambaModel`
`input_ids`	The input tensor containing the indices of tokens in the input sequence. Default is None. TYPE: `Optional[Tensor]` DEFAULT: `None`
`inputs_embeds`	The input tensor for the embeddings. Default is None. TYPE: `Optional[Tensor]` DEFAULT: `None`
`cache_params`	The optional cache parameters for the model. Default is None. TYPE: `Optional[List[Tensor]]` DEFAULT: `None`
`use_cache`	Flag to use cache. Default is None. TYPE: `Optional[bool]` DEFAULT: `None`
`output_hidden_states`	Flag to output hidden states. Default is None. TYPE: `Optional[bool]` DEFAULT: `None`
`return_dict`	Flag to return a dictionary. Default is None. TYPE: `Optional[bool]` DEFAULT: `None`
`**kwargs`	Additional keyword arguments. DEFAULT: `{}`

RETURNS	DESCRIPTION
`Union[Tuple, Dict]`	Union[Tuple, Dict]: Depending on the value of 'return_dict', it returns either a tuple or a dictionary. If 'return_dict' is False, returns a tuple containing 'hidden_states', 'cache_params', and 'all_hidden_states' if not None. If 'return_dict' is True, returns a dictionary with keys 'last_hidden_state', 'cache_params' (if 'use_cache' is True), and 'hidden_states'.

RAISES	DESCRIPTION
`ValueError`	If the input_ids and inputs_embeds are both None.
`RuntimeError`	If an error occurs during the forward pass.
`TypeError`	If the input_ids or inputs_embeds are not of type mindspore.Tensor.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    cache_params: Optional[List[mindspore.Tensor]] = None,
    use_cache: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    **kwargs,  # `attention_mask` is passed by the tokenizer and we don't want it
) -> Union[Tuple, Dict]:

    '''
    Run the embedding, every block and the final norm.

    Args:
        self (MSMambaModel): The instance of the MSMambaModel.
        input_ids (Optional[mindspore.Tensor]): Token indices; only used (through `self.embeddings`)
            when `inputs_embeds` is None. Default is None.
        inputs_embeds (Optional[mindspore.Tensor]): Pre-computed embeddings; take precedence over
            `input_ids`. Default is None.
        cache_params: Cache handed to every block. A fresh MSMambaCache is created when this is None
            and caching is enabled. It is used as an object exposing `seqlen_offset`. Default is None.
        use_cache (Optional[bool]): Whether to use the cache. When None, falls back to
            `config.use_cache`, or to False in training mode. Forced to False when gradient
            checkpointing is active during training.
        output_hidden_states (Optional[bool]): Whether to collect the output of every block plus the
            normalized final output. When None, falls back to `config.output_hidden_states`.
        return_dict (Optional[bool]): Whether to return a dict. When None, falls back to
            `config.use_return_dict`.
        **kwargs: Accepted and ignored.

    Returns:
        Union[Tuple, Dict]:

            - If 'return_dict' is falsy, a tuple of the final hidden states, 'cache_params' and the
            collected hidden states, with every None entry dropped.
            - Otherwise a dictionary with keys 'last_hidden_state', 'cache_params' (None unless
            caching is enabled) and 'hidden_states'.

    Note:
        No explicit validation is performed here: `input_ids` and `inputs_embeds` are not checked for
        being mutually exclusive, nor for both being None.
    '''
    # Resolve every tri-state flag against the configuration.
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = False if self.training else self.config.use_cache
    if return_dict is None:
        return_dict = self.config.use_return_dict

    if inputs_embeds is None:
        inputs_embeds = self.embeddings(input_ids)

    # Caching is switched off while training with gradient checkpointing.
    if self.gradient_checkpointing and self.training and use_cache:
        use_cache = False

    if cache_params is None and use_cache:
        cache_params = MSMambaCache(
            self.config, inputs_embeds.shape[0], dtype=inputs_embeds.dtype
        )

    collected = () if output_hidden_states else None
    hidden_states = inputs_embeds
    for mixer_block in self.layers:
        hidden_states = mixer_block(hidden_states, cache_params=cache_params)
        if output_hidden_states:
            collected += (hidden_states,)

    if use_cache:
        # Advance the cache offset by the number of positions just processed.
        ops.assign(cache_params.seqlen_offset, cache_params.seqlen_offset + inputs_embeds.shape[1])

    hidden_states = self.norm_f(hidden_states)

    if output_hidden_states:
        collected += (hidden_states,)

    if return_dict:
        return {
            'last_hidden_state': hidden_states,
            'cache_params': cache_params if use_cache else None,
            'hidden_states': collected,
        }

    return tuple(item for item in (hidden_states, cache_params, collected) if item is not None)

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.get_input_embeddings()` ¶

Retrieve the input embeddings for the MSMambaModel.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaModel class.

RETURNS	DESCRIPTION
	The embeddings associated with the input.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def get_input_embeddings(self):

    """
    Return the module currently stored as the input embedding layer.

    Args:
        self: The instance of the MSMambaModel class.

    Returns:
        The object held in `self.embeddings`.
    """
    embedding_layer = self.embeddings
    return embedding_layer

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.set_input_embeddings(new_embeddings)` ¶

Set the input embeddings for the MSMambaModel.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaModel class. TYPE: `MSMambaModel`
`new_embeddings`	The new input embeddings to be set for the MSMambaModel. TYPE: `object`

RETURNS	DESCRIPTION
	None.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def set_input_embeddings(self, new_embeddings):

    """
    Replace the input embedding layer.

    Args:
        self (MSMambaModel): The instance of the MSMambaModel class.
        new_embeddings (object): The object to store as `self.embeddings`.

    Returns:
        None.
    """
    # Plain attribute rebinding; no resizing or validation happens here.
    self.embeddings = new_embeddings

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaPreTrainedModel` ¶

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MSMambaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = MambaConfig
    base_model_prefix = "backbone"
    _no_split_modules = ["MSMambaBlock"]
    supports_gradient_checkpointing = True

    def _init_weights(self, cell):
        """
        Initialize the weights of a single sub-module.

        Args:
            cell: The module to initialize. MSMambaMixer, MambaDense and nn.Embedding instances get
                dedicated treatment; when `config.rescale_prenorm_residual` is set, any parameter of
                `cell` named "out_proj.weight" is additionally re-initialized and rescaled.

        Returns:
            None.
        """
        if isinstance(cell, MSMambaMixer):
            cell.A_log._no_weight_decay = True
            cell.D._no_weight_decay = True

            dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
            if self.config.time_step_init_scheme == "constant":
                cell.dt_proj.weight[:] = dt_init_std
            elif self.config.time_step_init_scheme == "random":
                cell.dt_proj.weight.set_data(initializer(Uniform(dt_init_std), cell.dt_proj.weight.shape, cell.dt_proj.weight.dtype))

            # Sample dt log-uniformly in [time_step_min, time_step_max], floored at time_step_floor.
            dt = ops.exp(
                ops.rand(self.config.intermediate_size)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + ops.log(-ops.expm1(-dt))
            cell.dt_proj.bias[:] = inv_dt
            # Flag the bias so the MambaDense branch below does not zero it again.
            cell.dt_proj.bias._no_reinit = True

        if isinstance(cell, MambaDense):
            if cell.bias is not None:
                if not getattr(cell.bias, "_no_reinit", False):
                    cell.bias[:] = 0
        elif isinstance(cell, nn.Embedding):
            cell.weight.set_data(initializer(Normal(self.config.initializer_range), cell.weight.shape, cell.weight.dtype))

        if self.config.rescale_prenorm_residual:
            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
            #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
            #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
            #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            for name, p in cell.parameters_and_names():
                if name in ["out_proj.weight"]:
                    # Special Scaled Initialization: re-draw the weight, then scale it by
                    # 1/sqrt(num_hidden_layers).
                    # We need to reinit p since this code could be called multiple times
                    # Having just p *= scale would repeatedly scale it down
                    # The layer count is `num_hidden_layers`, the same attribute the model uses to
                    # build its block stack.
                    p.set_data(
                        initializer(HeUniform(math.sqrt(5)), p.shape, p.dtype)
                        / math.sqrt(self.config.num_hidden_layers)
                    )

    def __call__(self, *args, **kwargs):

        """
        Call the model and wrap dictionary outputs in an ADDict.

        Args:
            self: The instance of the MSMambaPreTrainedModel class.
            *args: Positional arguments forwarded to the parent `__call__`.
            **kwargs: Keyword arguments forwarded to the parent `__call__`.

        Returns:
            Conditional returns:

                - If the outputs from the super().__call__(*args, **kwargs) are of type dict, the method returns
                an instance of ADDict(outputs).
                - Otherwise, it returns the outputs as is.
        """
        outputs = super().__call__(*args, **kwargs)
        if isinstance(outputs, dict):
            return ADDict(outputs)
        return outputs

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaPreTrainedModel.__call__(*args, **kwargs)` ¶

This method `__call__` is defined within the class MSMambaPreTrainedModel and is used to handle the call operation when an instance of the class is called as a function.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaPreTrainedModel class.

RETURNS	DESCRIPTION
	Conditional returns: If the outputs from `super().__call__(*args, **kwargs)` are of type dict, the method returns an instance of ADDict(outputs). Otherwise, it returns the outputs as is.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __call__(self, *args, **kwargs):

    """
    Call the model and wrap dictionary outputs in an ADDict.

    Args:
        self: The instance of the MSMambaPreTrainedModel class.
        *args: Positional arguments forwarded to the parent `__call__`.
        **kwargs: Keyword arguments forwarded to the parent `__call__`.

    Returns:
        Conditional returns:

            - An ADDict built from the result when the parent call returns a dict.
            - The parent's result, unchanged, in every other case.
    """
    result = super().__call__(*args, **kwargs)
    return ADDict(result) if isinstance(result, dict) else result

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm` ¶

Bases: Module

MSMambaRMSNorm is a class that represents a modified version of the T5LayerNorm, called LlamaRMSNorm. It is designed to normalize the hidden states of a neural network layer.

This class inherits from nn.Module and provides functionality to normalize the hidden states using a modified RMS normalization technique.

ATTRIBUTE	DESCRIPTION
`weight`	A parameter tensor that stores the weight values for the normalization. TYPE: `Parameter`
`variance_epsilon`	A small value added to the variance to avoid division by zero. TYPE: `float`

METHOD	DESCRIPTION
`__init__`	Initializes an instance of MSMambaRMSNorm.
`forward`	Normalizes the input hidden states using the RMS normalization technique.

Note

The input hidden states are expected to be of shape (batch_size, sequence_length, hidden_size).
The normalization is performed along the last dimension (hidden_size).

Example

>>> hidden_states = ops.random_normal((batch_size, sequence_length, hidden_size))
>>> norm_layer = MSMambaRMSNorm(hidden_size)
>>> normalized_states = norm_layer.forward(hidden_states)

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MSMambaRMSNorm(nn.Module):

    """
    Root-mean-square layer normalization with a learnable per-feature scale.

    The input is divided by the root of the mean of its squares over the last axis (plus a small
    epsilon) and then multiplied by `weight`. No mean is subtracted and there is no bias term.

    Attributes:
        weight (Parameter): Per-feature scale, initialized to ones.
        variance_epsilon (float): Value added to the mean of squares before the reciprocal square root.

    Note:
        - The statistics are computed in float32; the normalized values are converted back to the
          input dtype before being scaled by `weight`.
        - The normalization is performed along the last dimension.

    Example:
        ```python
        >>> hidden_states = ops.random_normal((batch_size, sequence_length, hidden_size))
        >>> norm_layer = MSMambaRMSNorm(hidden_size)
        >>> normalized_states = norm_layer.forward(hidden_states)
        ```
    """
    def __init__(self, hidden_size, eps=1e-6):
        """
        Create the scale parameter and store the epsilon.

        Args:
            hidden_size: Size of the `weight` parameter.
            eps (float): Value stored as `variance_epsilon`. Defaults to 1e-6.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = Parameter(ops.ones(hidden_size))

    def forward(self, hidden_states):

        '''
        Apply RMS normalization over the last axis and scale by `weight`.

        Args:
            self (MSMambaRMSNorm): The instance of the MSMambaRMSNorm class.
            hidden_states (Tensor): The tensor to normalize.

        Returns:
            Tensor: `weight` times the normalized input. The input tensor itself is not modified.
        '''
        orig_dtype = hidden_states.dtype
        # Compute the statistics in float32 regardless of the input precision.
        states_fp32 = hidden_states.to(mindspore.float32)
        mean_square = states_fp32.pow(2).mean(-1, keep_dims=True)
        inv_rms = ops.rsqrt(mean_square + self.variance_epsilon)
        normed = states_fp32 * inv_rms
        return self.weight * normed.to(orig_dtype)

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm.__init__(hidden_size, eps=1e-06)` ¶

LlamaRMSNorm is equivalent to T5LayerNorm

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def __init__(self, hidden_size, eps=1e-6):
    """
    Create the scale parameter and store the epsilon.

    Args:
        hidden_size: Size of the `weight` parameter.
        eps (float): Value stored as `variance_epsilon`. Defaults to 1e-6.
    """
    super().__init__()
    self.variance_epsilon = eps
    self.weight = Parameter(ops.ones(hidden_size))

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm.forward(hidden_states)` ¶

Constructs an instance of MSMambaRMSNorm.

PARAMETER	DESCRIPTION
`self`	The instance of the MSMambaRMSNorm class. TYPE: `MSMambaRMSNorm`
`hidden_states`	The input tensor containing the hidden states. It should be of type tensor and have a shape (batch_size, sequence_length, hidden_size). TYPE: `Tensor`

RETURNS	DESCRIPTION
`Tensor`	The RMS-normalized hidden states scaled by `weight`; the input tensor is not modified in place.

RAISES	DESCRIPTION
`TypeError`	If the hidden_states parameter is not of type tensor.
`ValueError`	If the hidden_states tensor does not have the expected shape.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def forward(self, hidden_states):

    '''
    Apply RMS normalization over the last axis and scale by `weight`.

    Args:
        self (MSMambaRMSNorm): The instance of the MSMambaRMSNorm class.
        hidden_states (Tensor): The tensor to normalize.

    Returns:
        Tensor: `weight` times the normalized input. The input tensor itself is not modified.
    '''
    orig_dtype = hidden_states.dtype
    # Compute the statistics in float32 regardless of the input precision.
    states_fp32 = hidden_states.to(mindspore.float32)
    mean_square = states_fp32.pow(2).mean(-1, keep_dims=True)
    inv_rms = ops.rsqrt(mean_square + self.variance_epsilon)
    normed = states_fp32 * inv_rms
    return self.weight * normed.to(orig_dtype)

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MambaDense` ¶

Bases: Linear

MambaDense represents a dense layer in a neural network. It performs matrix multiplication with optional bias addition and reshaping of input data. This class inherits from nn.Linear.

Example

>>> def forward(self, x):
>>>     x_shape = x.shape
>>>     if len(x_shape) != 2:
>>>         x = x.reshape(-1, x.shape[-1])
>>>     x = ops.matmul(x, self.weight.T)
>>>     if self.bias:
>>>         x = ops.add(x, self.bias)
>>>     if len(x_shape) != 2:
>>>         out_shape = x_shape[:-1] + (x.shape[-1], )
>>>         x = x.reshape(out_shape)
>>>     return x

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

class MambaDense(nn.Linear):

    """
    Dense (fully connected) layer that always performs its matmul on 2D data.

    This class inherits from nn.Linear and only overrides ``forward``: inputs
    whose rank is not 2 are flattened to ``(-1, last_dim)``, multiplied by the
    transposed weight, optionally offset by the bias, and then reshaped back
    so that every leading dimension of the original input is preserved.
    """
    def forward(self, x):
        """
        Compute ``x @ self.weight.T`` (plus ``self.bias`` when one is set).

        Args:
            self (MambaDense): The instance of the MambaDense class.
            x (Tensor): Input tensor. If it is not 2D it is flattened to 2D
                for the matmul and the leading dimensions are restored on the
                result.

        Returns:
            Tensor: The layer output. Its shape is ``x.shape[:-1]`` followed
                by the size of the last axis of the matmul result.
        """
        x_shape = x.shape
        if len(x_shape) != 2:
            # Collapse all leading dimensions so the matmul sees a 2D operand.
            x = x.reshape(-1, x.shape[-1])
        x = ops.matmul(x, self.weight.T)
        # Test for presence, not truthiness: evaluating a multi-element tensor
        # as a boolean is ambiguous and must not decide whether bias is added.
        if self.bias is not None:
            x = ops.add(x, self.bias)
        if len(x_shape) != 2:
            # Restore the original leading dimensions around the new last axis.
            out_shape = x_shape[:-1] + (x.shape[-1],)
            x = x.reshape(out_shape)
        return x

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MambaDense.forward(x)` ¶

Constructs the output of the MambaDense layer by performing matrix multiplication with weights and adding bias if applicable.

PARAMETER	DESCRIPTION
`self`	The instance of the MambaDense class. TYPE: `MambaDense`
`x`	Input tensor for the layer. If it is not 2D, it is flattened to 2D for the matrix multiplication and the leading dimensions are restored on the output. TYPE: `Tensor`

RETURNS	DESCRIPTION
`Tensor`	The output of the MambaDense layer after matrix multiplication with the transposed weights and addition of the bias, if one is set.

RAISES	DESCRIPTION
`ValueError`	If the input data x is not a 2D numpy array.

Source code in mindnlp\transformers\models\mamba\modeling_graph_mamba.py

def forward(self, x):
    """
    Compute ``x @ self.weight.T`` (plus ``self.bias`` when one is set).

    Args:
        self (MambaDense): The instance of the MambaDense class.
        x (Tensor): Input tensor. If it is not 2D it is flattened to 2D for
            the matmul and the leading dimensions are restored on the result.

    Returns:
        Tensor: The layer output. Its shape is ``x.shape[:-1]`` followed by
            the size of the last axis of the matmul result.
    """
    x_shape = x.shape
    if len(x_shape) != 2:
        # Collapse all leading dimensions so the matmul sees a 2D operand.
        x = x.reshape(-1, x.shape[-1])
    x = ops.matmul(x, self.weight.T)
    # Test for presence, not truthiness: evaluating a multi-element tensor as
    # a boolean is ambiguous and must not decide whether bias is added.
    if self.bias is not None:
        x = ops.add(x, self.bias)
    if len(x_shape) != 2:
        # Restore the original leading dimensions around the new last axis.
        out_shape = x_shape[:-1] + (x.shape[-1],)
        x = x.reshape(out_shape)
    return x

mamba

mindnlp.transformers.models.mamba.modeling_mamba ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaCausalLMOutput dataclass ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaForCausalLM ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaForCausalLM.forward(input_ids=None, attention_mask=None, inputs_embeds=None, cache_params=None, labels=None, output_hidden_states=None, return_dict=None, use_cache=None, cache_position=None, **kwargs) ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaMixer ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaOutput dataclass ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaPreTrainedModel ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaRMSNorm ¶

mindnlp.transformers.models.mamba.modeling_mamba.MambaRMSNorm.__init__(hidden_size, eps=1e-06) ¶

mindnlp.transformers.models.mamba.configuration_mamba ¶

mindnlp.transformers.models.mamba.configuration_mamba.MambaConfig ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock.__init__(config, layer_idx) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock.forward(hidden_states, cache_params=None) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaCache ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaCache.__init__(config, batch_size, dtype=mindspore.float16) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.__init__(config) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.forward(input_ids=None, inputs_embeds=None, cache_params=None, labels=None, output_hidden_states=None, return_dict=None, **kwargs) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.get_input_embeddings() ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.get_output_embeddings() ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.prepare_inputs_for_generation(input_ids, cache_params=None, inputs_embeds=None, **kwargs) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.set_input_embeddings(new_embeddings) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.set_output_embeddings(new_embeddings) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer.__init__(config, layer_idx) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer.forward(input_states, cache_params=None) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.__init__(config) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.forward(input_ids=None, inputs_embeds=None, cache_params=None, use_cache=None, output_hidden_states=None, return_dict=None, **kwargs) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.get_input_embeddings() ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.set_input_embeddings(new_embeddings) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaPreTrainedModel ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaPreTrainedModel.__call__(*args, **kwargs) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm.__init__(hidden_size, eps=1e-06) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm.forward(hidden_states) ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MambaDense ¶

mindnlp.transformers.models.mamba.modeling_graph_mamba.MambaDense.forward(x) ¶

`mindnlp.transformers.models.mamba.modeling_mamba` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaCausalLMOutput` `dataclass` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaForCausalLM` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaForCausalLM.forward(input_ids=None, attention_mask=None, inputs_embeds=None, cache_params=None, labels=None, output_hidden_states=None, return_dict=None, use_cache=None, cache_position=None, **kwargs)` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaMixer` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaOutput` `dataclass` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaPreTrainedModel` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaRMSNorm` ¶

`mindnlp.transformers.models.mamba.modeling_mamba.MambaRMSNorm.__init__(hidden_size, eps=1e-06)` ¶

`mindnlp.transformers.models.mamba.configuration_mamba` ¶

`mindnlp.transformers.models.mamba.configuration_mamba.MambaConfig` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock.__init__(config, layer_idx)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaBlock.forward(hidden_states, cache_params=None)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaCache` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaCache.__init__(config, batch_size, dtype=mindspore.float16)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.__init__(config)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.forward(input_ids=None, inputs_embeds=None, cache_params=None, labels=None, output_hidden_states=None, return_dict=None, **kwargs)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.get_input_embeddings()` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.get_output_embeddings()` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.prepare_inputs_for_generation(input_ids, cache_params=None, inputs_embeds=None, **kwargs)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.set_input_embeddings(new_embeddings)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaForCausalLM.set_output_embeddings(new_embeddings)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer.__init__(config, layer_idx)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaMixer.forward(input_states, cache_params=None)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.__init__(config)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.forward(input_ids=None, inputs_embeds=None, cache_params=None, use_cache=None, output_hidden_states=None, return_dict=None, **kwargs)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.get_input_embeddings()` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaModel.set_input_embeddings(new_embeddings)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaPreTrainedModel` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaPreTrainedModel.__call__(*args, **kwargs)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm.__init__(hidden_size, eps=1e-06)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MSMambaRMSNorm.forward(hidden_states)` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MambaDense` ¶

`mindnlp.transformers.models.mamba.modeling_graph_mamba.MambaDense.forward(x)` ¶