mindnlp.transformers.models.rwkv.modeling_rwkv

MindSpore RWKV model.

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvCausalLMOutput dataclass

Bases: ModelOutput

Base class for causal language model (or autoregressive) outputs.

PARAMETER DESCRIPTION
loss

Language modeling loss (for next-token prediction).

TYPE: `mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided DEFAULT: None

logits

Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)` DEFAULT: None

state

The state of the model at the last time step. Can be used in a forward method with the next input_ids to avoid providing the old input_ids.

TYPE: list of five `mindspore.Tensor` of shape `(batch_size, hidden_size, num_hidden_layers)` DEFAULT: None

hidden_states

Tuple of mindspore.Tensor (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size).

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

TYPE: `tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` DEFAULT: None

attentions

Tuple of mindspore.Tensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).

Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

TYPE: `tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True` DEFAULT: None

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
@dataclass
class RwkvCausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        state (list of five `mindspore.Tensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[mindspore.Tensor] = None
    logits: mindspore.Tensor = None
    state: Optional[List[mindspore.Tensor]] = None
    hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
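
Since `RwkvCausalLMOutput` subclasses `ModelOutput`, its fields can be read either by attribute or as a positional tuple (the path taken when `return_dict=False`). A minimal pure-Python sketch of that duality, using a hypothetical `TinyCausalLMOutput` stand-in with plain lists instead of `mindspore.Tensor`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class TinyCausalLMOutput:
    # Hypothetical stand-in for RwkvCausalLMOutput; lists replace Tensors.
    loss: Optional[float] = None
    logits: Optional[list] = None
    state: Optional[list] = None

    def to_tuple(self):
        # Mirrors ModelOutput's tuple view: fields keep declaration order
        # and None-valued fields are dropped.
        return tuple(v for v in (self.loss, self.logits, self.state) if v is not None)

out = TinyCausalLMOutput(logits=[[0.1, 0.9]], state=[[0.0]])
assert out.to_tuple() == ([[0.1, 0.9]], [[0.0]])  # loss=None is skipped
```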

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvForCausalLM

Bases: RwkvPreTrainedModel

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
class RwkvForCausalLM(RwkvPreTrainedModel):
    _tied_weights_keys = ["head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.rwkv = RwkvModel(config)
        self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.head

    def set_output_embeddings(self, new_embeddings):
        self.head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, use_cache=None, **kwargs):
        # only last token for input_ids if the state is passed along.
        if state is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and state is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs["state"] = state
        model_inputs["use_cache"] = use_cache
        return model_inputs

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,  # noqa
        inputs_embeds: Optional[mindspore.Tensor] = None,
        state: Optional[List[mindspore.Tensor]] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, RwkvCausalLMOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        rwkv_outputs = self.rwkv(
            input_ids,
            inputs_embeds=inputs_embeds,
            state=state,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = rwkv_outputs[0]

        logits = self.head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + rwkv_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return RwkvCausalLMOutput(
            loss=loss,
            logits=logits,
            state=rwkv_outputs.state,
            hidden_states=rwkv_outputs.hidden_states,
            attentions=rwkv_outputs.attentions,
        )
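
The trimming logic in `prepare_inputs_for_generation` above can be mirrored in plain Python (a sketch over nested lists rather than `mindspore.Tensor`, so the control flow is easy to follow): once a recurrent `state` is available, only the newest token needs to be fed back in.

```python
def prepare_inputs(input_ids, state=None, inputs_embeds=None, use_cache=None):
    # Plain-list mirror of RwkvForCausalLM.prepare_inputs_for_generation.
    if state is not None:
        # With a cached recurrent state, only the last token is needed.
        input_ids = [[row[-1]] for row in input_ids]
    if inputs_embeds is not None and state is None:
        # Embeddings are only usable on the first step, before any state exists.
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}
    model_inputs["state"] = state
    model_inputs["use_cache"] = use_cache
    return model_inputs

assert prepare_inputs([[5, 6, 7]], state=["dummy"])["input_ids"] == [[7]]
```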

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvForCausalLM.forward(input_ids=None, attention_mask=None, inputs_embeds=None, state=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

labels (mindspore.Tensor of shape (batch_size, sequence_length), optional): Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size].

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,  # noqa
    inputs_embeds: Optional[mindspore.Tensor] = None,
    state: Optional[List[mindspore.Tensor]] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, RwkvCausalLMOutput]:
    r"""
    labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
        `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
        are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    rwkv_outputs = self.rwkv(
        input_ids,
        inputs_embeds=inputs_embeds,
        state=state,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = rwkv_outputs[0]

    logits = self.head(hidden_states)

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

    if not return_dict:
        output = (logits,) + rwkv_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return RwkvCausalLMOutput(
        loss=loss,
        logits=logits,
        state=rwkv_outputs.state,
        hidden_states=rwkv_outputs.hidden_states,
        attentions=rwkv_outputs.attentions,
    )
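
The loss computation in `forward` shifts logits and labels by one position, so the logits at step t score the token at t+1. A pure-Python sketch of that shifted cross-entropy (lists stand in for tensors; `-100` labels are masked, as in the model):

```python
import math

def shifted_cross_entropy(logits, labels, ignore_index=-100):
    # logits: [batch][seq][vocab] scores; labels: [batch][seq] token ids.
    total, count = 0.0, 0
    for logit_row, label_row in zip(logits, labels):
        # Position t predicts label t+1: drop the last logits, first labels.
        for scores, target in zip(logit_row[:-1], label_row[1:]):
            if target == ignore_index:
                continue  # masked label, contributes no loss
            top = max(scores)
            log_norm = top + math.log(sum(math.exp(s - top) for s in scores))
            total += log_norm - scores[target]
            count += 1
    return total / count

# Uniform scores over a 2-token vocabulary give a loss of ln(2) per position.
loss = shifted_cross_entropy([[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]], [[0, 1, 0]])
assert abs(loss - math.log(2)) < 1e-12
```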

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvLinearAttention

Bases: Cell

RWKV linear attention

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
class RwkvLinearAttention(Cell):
    """RWKV linear attention"""
    def __init__(self, config):
        """
        Initializes an instance of the RwkvLinearAttention class.

        Args:
            self (RwkvLinearAttention): The instance of the RwkvLinearAttention class.
            config (object): The configuration object containing the context length parameter.
                It is used to set the maximum sequence length and load CUDA kernels.
                Must have the attribute 'context_length' specifying the context length.

        Returns:
            None.

        Raises:
            KeyError: If the 'config' object does not have the 'context_length' attribute.
            RuntimeError: If there is an issue loading the CUDA kernels.
        """
        super().__init__()
        self.max_seq_length = config.context_length
        self.wkv_forward_with_state = load_wkv_cuda_kernel('wkv_forward_with_state', config.context_length)
        self.wkv_forward = load_wkv_cuda_kernel('wkv_forward', config.context_length)

        self.wkv_backward = load_wkv_cuda_kernel('wkv_backward', config.context_length)

    def construct(self, time_decay, time_first, key, value, state=None, return_state=False):
        """
        Constructs the linear attention mechanism for the RwkvLinearAttention class.

        Args:
            self: The instance of the RwkvLinearAttention class.
            time_decay (Union[int, float]): The time decay factor for the attention mechanism.
            time_first (Union[int, float]): The time first factor for the attention mechanism.
            key (Tensor): The input tensor representing the keys for the attention mechanism. 
                The shape of the tensor should be (batch_size, seq_len, hidden_size).
            value (Tensor): The input tensor representing the values for the attention mechanism. 
                The shape of the tensor should be (batch_size, seq_len, hidden_size).
            state (Tensor, optional): The optional input tensor representing the state for the attention mechanism. 
                It has a default value of None. The shape of the tensor should be (batch_size, hidden_size, 3).
            return_state (bool, optional): A flag indicating whether to return the state. 
                It has a default value of False.

        Returns:
            Tuple[Tensor, Tensor]: A tuple containing the output tensor of the attention mechanism 
            and the state tensor if return_state is True. The output tensor represents the result of
            the attention mechanism.
            The state tensor represents the updated state of the attention mechanism if return_state is True.

        Raises:
            ValueError: If the sequence length is greater than the maximum sequence length allowed by the model.
            ValueError: If the product of batch size and hidden size is not a round multiple of the minimum of the
                hidden size and 32.
        """
        batch_size, seq_len, hidden_size = key.shape
        if seq_len > self.max_seq_length:
            raise ValueError(
                f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
                f"{self.max_seq_length} with this model."
            )
        if batch_size * hidden_size % min(hidden_size, 32) != 0:
            raise ValueError(
                f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
                f"multiple of {min(hidden_size, 32)}."
            )

        input_dtype = key.dtype

        time_decay = ops.neg(ops.exp(time_decay.astype(mindspore.float32)))
        if key.dtype == mindspore.float16:
            time_first = time_first.astype(mindspore.float32)
            key = key.astype(mindspore.float32)
            value = value.astype(mindspore.float32)
        # The CUDA kernel will fill this tensor.

        if return_state:
            if state is None:
                state = ops.zeros((batch_size, hidden_size, 3), dtype=mindspore.float32)
                state[:, :, 2] -= 1e38
            else:
                state = ops.cat([s.expand_dims(2) for s in state], dim=2)
            output = self.wkv_forward_with_state(time_decay, time_first, key, value, state)
        else:
            output = self.wkv_forward(time_decay, time_first, key, value)

        if state is not None:
            state = [s.squeeze(2) for s in ops.chunk(state, 3, dim=2)]

        return output.astype(input_dtype), state

    # g stands for grad
    def bprop(self, w, u, k, v, s, return_state, y, gy):
        """bporp for wkv"""
        dtype = k.dtype
        k = k.astype(mindspore.float32)
        v = v.astype(mindspore.float32)
        gy = gy[0].astype(mindspore.float32)
        gw, gu, gk, gv = self.wkv_backward(w, u, k, v, gy)
        gw = ops.sum(gw, 0)
        gu = ops.sum(gu, 0)

        return (gw, gu, gk.astype(dtype), gv.astype(dtype))
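
`construct` dispatches to the compiled WKV CUDA kernel, but the recurrence it computes can be stated in a few lines of plain Python. Below is a numerically unstabilized sketch for a single channel (the real kernel additionally tracks a running maximum exponent, which is why the third state channel is initialized to -1e38); `w` is the already-negated decay `-exp(time_decay)` and `u` is `time_first`:

```python
import math

def wkv_naive(w, u, k, v):
    # Unstabilized single-channel WKV recurrence (reference sketch only).
    # a accumulates exp(k)*v, b accumulates exp(k); w < 0 decays both per step.
    a, b, out = 0.0, 0.0, []
    for k_t, v_t in zip(k, v):
        boost = math.exp(u + k_t)          # current token gets the time_first bonus
        out.append((a + boost * v_t) / (b + boost))
        a = math.exp(w) * a + math.exp(k_t) * v_t
        b = math.exp(w) * b + math.exp(k_t)
    return out

# With no decay (w=0, only for illustration) and flat keys,
# the output is a running mean of v.
assert wkv_naive(0.0, 0.0, [0.0, 0.0], [1.0, 3.0]) == [1.0, 2.0]
```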

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvLinearAttention.__init__(config)

Initializes an instance of the RwkvLinearAttention class.

PARAMETER DESCRIPTION
self

The instance of the RwkvLinearAttention class.

TYPE: RwkvLinearAttention

config

The configuration object containing the context length parameter. It is used to set the maximum sequence length and load CUDA kernels. Must have the attribute 'context_length' specifying the context length.

TYPE: object

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
KeyError

If the 'config' object does not have the 'context_length' attribute.

RuntimeError

If there is an issue loading the CUDA kernels.

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
def __init__(self, config):
    """
    Initializes an instance of the RwkvLinearAttention class.

    Args:
        self (RwkvLinearAttention): The instance of the RwkvLinearAttention class.
        config (object): The configuration object containing the context length parameter.
            It is used to set the maximum sequence length and load CUDA kernels.
            Must have the attribute 'context_length' specifying the context length.

    Returns:
        None.

    Raises:
        KeyError: If the 'config' object does not have the 'context_length' attribute.
        RuntimeError: If there is an issue loading the CUDA kernels.
    """
    super().__init__()
    self.max_seq_length = config.context_length
    self.wkv_forward_with_state = load_wkv_cuda_kernel('wkv_forward_with_state', config.context_length)
    self.wkv_forward = load_wkv_cuda_kernel('wkv_forward', config.context_length)

    self.wkv_backward = load_wkv_cuda_kernel('wkv_backward', config.context_length)

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvLinearAttention.bprop(w, u, k, v, s, return_state, y, gy)

bprop for wkv

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
def bprop(self, w, u, k, v, s, return_state, y, gy):
    """bporp for wkv"""
    dtype = k.dtype
    k = k.astype(mindspore.float32)
    v = v.astype(mindspore.float32)
    gy = gy[0].astype(mindspore.float32)
    gw, gu, gk, gv = self.wkv_backward(w, u, k, v, gy)
    gw = ops.sum(gw, 0)
    gu = ops.sum(gu, 0)

    return (gw, gu, gk.astype(dtype), gv.astype(dtype))

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvLinearAttention.construct(time_decay, time_first, key, value, state=None, return_state=False)

Constructs the linear attention mechanism for the RwkvLinearAttention class.

PARAMETER DESCRIPTION
self

The instance of the RwkvLinearAttention class.

time_decay

The time decay factor for the attention mechanism.

TYPE: Union[int, float]

time_first

The time first factor for the attention mechanism.

TYPE: Union[int, float]

key

The input tensor representing the keys for the attention mechanism. The shape of the tensor should be (batch_size, seq_len, hidden_size).

TYPE: Tensor

value

The input tensor representing the values for the attention mechanism. The shape of the tensor should be (batch_size, seq_len, hidden_size).

TYPE: Tensor

state

The optional input tensor representing the state for the attention mechanism. It has a default value of None. The shape of the tensor should be (batch_size, hidden_size, 3).

TYPE: Tensor DEFAULT: None

return_state

A flag indicating whether to return the state. It has a default value of False.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION

Tuple[Tensor, Tensor]: A tuple containing the output tensor of the attention mechanism

and the state tensor if return_state is True. The output tensor represents the result of

the attention mechanism.

The state tensor represents the updated state of the attention mechanism if return_state is True.

RAISES DESCRIPTION
ValueError

If the sequence length is greater than the maximum sequence length allowed by the model.

ValueError

If the product of batch size and hidden size is not a round multiple of the minimum of the hidden size and 32.

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
def construct(self, time_decay, time_first, key, value, state=None, return_state=False):
    """
    Constructs the linear attention mechanism for the RwkvLinearAttention class.

    Args:
        self: The instance of the RwkvLinearAttention class.
        time_decay (Union[int, float]): The time decay factor for the attention mechanism.
        time_first (Union[int, float]): The time first factor for the attention mechanism.
        key (Tensor): The input tensor representing the keys for the attention mechanism. 
            The shape of the tensor should be (batch_size, seq_len, hidden_size).
        value (Tensor): The input tensor representing the values for the attention mechanism. 
            The shape of the tensor should be (batch_size, seq_len, hidden_size).
        state (Tensor, optional): The optional input tensor representing the state for the attention mechanism. 
            It has a default value of None. The shape of the tensor should be (batch_size, hidden_size, 3).
        return_state (bool, optional): A flag indicating whether to return the state. 
            It has a default value of False.

    Returns:
        Tuple[Tensor, Tensor]: A tuple containing the output tensor of the attention mechanism 
        and the state tensor if return_state is True. The output tensor represents the result of
        the attention mechanism.
        The state tensor represents the updated state of the attention mechanism if return_state is True.

    Raises:
        ValueError: If the sequence length is greater than the maximum sequence length allowed by the model.
        ValueError: If the product of batch size and hidden size is not a round multiple of the minimum of the
            hidden size and 32.
    """
    batch_size, seq_len, hidden_size = key.shape
    if seq_len > self.max_seq_length:
        raise ValueError(
            f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
            f"{self.max_seq_length} with this model."
        )
    if batch_size * hidden_size % min(hidden_size, 32) != 0:
        raise ValueError(
            f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
            f"multiple of {min(hidden_size, 32)}."
        )

    input_dtype = key.dtype

    time_decay = ops.neg(ops.exp(time_decay.astype(mindspore.float32)))
    if key.dtype == mindspore.float16:
        time_first = time_first.astype(mindspore.float32)
        key = key.astype(mindspore.float32)
        value = value.astype(mindspore.float32)
    # The CUDA kernel will fill this tensor.

    if return_state:
        if state is None:
            state = ops.zeros((batch_size, hidden_size, 3), dtype=mindspore.float32)
            state[:, :, 2] -= 1e38
        else:
            state = ops.cat([s.expand_dims(2) for s in state], dim=2)
        output = self.wkv_forward_with_state(time_decay, time_first, key, value, state)
    else:
        output = self.wkv_forward(time_decay, time_first, key, value)

    if state is not None:
        state = [s.squeeze(2) for s in ops.chunk(state, 3, dim=2)]

    return output.astype(input_dtype), state

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvOutput dataclass

Bases: ModelOutput

Class for the RWKV model outputs.

PARAMETER DESCRIPTION
last_hidden_state

Sequence of hidden-states at the output of the last layer of the model.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)` DEFAULT: None

state

The state of the model at the last time step. Can be used in a forward method with the next input_ids to avoid providing the old input_ids.

TYPE: list of five `mindspore.Tensor` of shape `(batch_size, hidden_size, num_hidden_layers)` DEFAULT: None

hidden_states

Tuple of mindspore.Tensor (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size).

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

TYPE: `tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` DEFAULT: None

attentions

Tuple of mindspore.Tensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).

Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

TYPE: `tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True` DEFAULT: None

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
@dataclass
class RwkvOutput(ModelOutput):
    """
    Class for the RWKV model outputs.

    Args:
        last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        state (list of five `mindspore.Tensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: mindspore.Tensor = None
    state: Optional[List[mindspore.Tensor]] = None
    hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    attentions: Optional[Tuple[mindspore.Tensor, ...]] = None

mindnlp.transformers.models.rwkv.modeling_rwkv.RwkvPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
class RwkvPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = RwkvConfig
    base_model_prefix = "rwkv"
    _no_split_modules = ["RwkvBlock"]
    _keep_in_fp32_modules = ["time_decay", "time_first"]
    supports_gradient_checkpointing = True
    _is_stateful = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, RwkvSelfAttention):
            layer_id = module.layer_id
            num_hidden_layers = module.config.num_hidden_layers
            hidden_size = module.config.hidden_size
            attention_hidden_size = module.attention_hidden_size

            ratio_0_to_1 = layer_id / (num_hidden_layers - 1)  # 0 to 1
            ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

            time_weight = mindspore.tensor(
                [i / hidden_size for i in range(hidden_size)],
                dtype=module.time_mix_key.dtype,
            )
            time_weight = time_weight[None, None, :]

            decay_speed = [
                -5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
                for h in range(attention_hidden_size)
            ]
            decay_speed = mindspore.tensor(decay_speed, dtype=module.time_decay.dtype)
            zigzag = (
                mindspore.tensor(
                    [(i + 1) % 3 - 1 for i in range(attention_hidden_size)],
                    dtype=module.time_first.dtype,
                )
                * 0.5
            )

            with no_grad():
                ops.assign(module.time_decay, decay_speed)
                ops.assign(module.time_first, ops.ones_like(module.time_first) * math.log(0.3) + zigzag)

                ops.assign(module.time_mix_key, ops.pow(time_weight, ratio_1_to_almost0))
                ops.assign(module.time_mix_value, ops.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)
                ops.assign(module.time_mix_receptance, ops.pow(time_weight, 0.5 * ratio_1_to_almost0))
        elif isinstance(module, RwkvFeedForward):
            layer_id = module.layer_id
            num_hidden_layers = module.config.num_hidden_layers
            hidden_size = module.config.hidden_size

            ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

            time_weight = mindspore.tensor(
                [i / hidden_size for i in range(hidden_size)],
                dtype=module.time_mix_key.dtype,
            )
            time_weight = time_weight[None, None, :]

            with no_grad():
                ops.assign(module.time_mix_key, ops.pow(time_weight, ratio_1_to_almost0))
                ops.assign(module.time_mix_receptance, ops.pow(time_weight, ratio_1_to_almost0))
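
The `decay_speed` initialization in `_init_weights` interpolates from -5 to 3 across attention channels, with a layer-dependent exponent. Isolated as a plain function (a sketch; the name `rwkv_decay_speed` is hypothetical, not part of the API):

```python
def rwkv_decay_speed(attention_hidden_size, layer_id, num_hidden_layers):
    # ratio_0_to_1 grows from 0 (first layer) to 1 (last layer) and
    # sharpens the channel-wise curve from exponent 0.7 up to 2.0.
    ratio_0_to_1 = layer_id / (num_hidden_layers - 1)
    return [
        -5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
        for h in range(attention_hidden_size)
    ]

# First and last channels always land on -5 and 3, regardless of layer.
assert rwkv_decay_speed(2, 0, 2) == [-5.0, 3.0]
```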

mindnlp.transformers.models.rwkv.modeling_rwkv.load_wkv_cuda_kernel(func_name, context_length)

load wkv cuda kernel

Source code in mindnlp\transformers\models\rwkv\modeling_rwkv.py
def load_wkv_cuda_kernel(func_name, context_length):
    """load wkv cuda kernel"""
    device_target = mindspore.get_context('device_target')
    if device_target != 'GPU':
        raise RuntimeError('WKV operator only support GPU currently.')

    logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.")

    from ...kernel_utils import compile_kernel
    so_path = compile_kernel(kernel_name="wkv", Tmax=context_length)
    wkv_op = Custom(
        str(so_path) + ':' + func_name,
        out_shape=WKV_SHAPE_INFER[func_name],
        out_dtype=WKV_DTYPE_INFER[func_name],
        func_type='aot'
    )
    wkv_op.add_prim_attr('primitive_target', device_target)
    return wkv_op

mindnlp.transformers.models.rwkv.configuration_rwkv

RWKV configuration

mindnlp.transformers.models.rwkv.configuration_rwkv.RwkvConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [RwkvModel]. It is used to instantiate a RWKV model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RWKV-4 RWKV/rwkv-4-169m-pile architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
vocab_size

Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the input_ids passed when calling [RwkvModel].

TYPE: `int`, *optional*, defaults to 50277 DEFAULT: 50277

context_length

The maximum sequence length that this model can be used with in a single forward pass (using it in RNN mode lets you use any sequence length).

TYPE: `int`, *optional*, defaults to 1024 DEFAULT: 1024

hidden_size

Dimensionality of the embeddings and hidden states.

TYPE: `int`, *optional*, defaults to 4096 DEFAULT: 4096

num_hidden_layers

Number of hidden layers in the model.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

attention_hidden_size

Dimensionality of the attention hidden states. Will default to hidden_size if unset.

TYPE: `int`, *optional* DEFAULT: None

intermediate_size

Dimensionality of the inner feed-forward layers. Will default to 4 times hidden_size if unset.

TYPE: `int`, *optional* DEFAULT: None

layer_norm_epsilon

The epsilon to use in the layer normalization layers.

TYPE: `float`, *optional*, defaults to 1e-05 DEFAULT: 1e-05

bos_token_id

The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as GPTNeoX.

TYPE: `int`, *optional*, defaults to 0 DEFAULT: 0

eos_token_id

The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as GPTNeoX.

TYPE: `int`, *optional*, defaults to 0 DEFAULT: 0

rescale_every

At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every rescale_every layers. If set to 0 or a negative number, no rescaling is done.

TYPE: `int`, *optional*, defaults to 6 DEFAULT: 6

tie_word_embeddings

Whether or not to tie the word embeddings with the input token embeddings.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

use_cache

Whether or not the model should return the last state.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

Example
>>> from transformers import RwkvConfig, RwkvModel
...
>>> # Initializing a Rwkv configuration
>>> configuration = RwkvConfig()
...
>>> # Initializing a model (with random weights) from the configuration
>>> model = RwkvModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in mindnlp\transformers\models\rwkv\configuration_rwkv.py
class RwkvConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the RWKV-4
    [RWKV/rwkv-4-169m-pile](https://hf-mirror.com/RWKV/rwkv-4-169m-pile) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50277):
            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`RwkvModel`].
        context_length (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model can be used with in a single forward pass (using it in RNN
            mode lets you use any sequence length).
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the embeddings and hidden states.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the model.
        attention_hidden_size (`int`, *optional*):
            Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
        intermediate_size (`int`, *optional*):
            Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
            The epsilon to use in the layer normalization layers.
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
            as GPTNeoX.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
            GPTNeoX.
        rescale_every (`int`, *optional*, defaults to 6):
            At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every
            `rescale_every` layers. If set to 0 or a negative number, no rescaling is done.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to tie the word embeddings with the input token embeddings.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last state.


    Example:
        ```python
        >>> from transformers import RwkvConfig, RwkvModel
        ...
        >>> # Initializing a Rwkv configuration
        >>> configuration = RwkvConfig()
        ...
        >>> # Initializing a model (with random weights) from the configuration
        >>> model = RwkvModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "rwkv"
    attribute_map = {"max_position_embeddings": "context_length"}
    pretrained_config_archive_map = RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(
        self,
        vocab_size=50277,
        context_length=1024,
        hidden_size=4096,
        num_hidden_layers=32,
        attention_hidden_size=None,
        intermediate_size=None,
        layer_norm_epsilon=1e-5,
        bos_token_id=0,
        eos_token_id=0,
        rescale_every=6,
        tie_word_embeddings=False,
        use_cache=True,
        **kwargs,
    ):
        """
        Initializes an instance of RwkvConfig.

        Args:
            self: The instance itself.
            vocab_size (int): The size of the vocabulary. Default is 50277.
            context_length (int): The length of the context. Default is 1024.
            hidden_size (int): The size of the hidden layers. Default is 4096.
            num_hidden_layers (int): The number of hidden layers. Default is 32.
            attention_hidden_size (int, optional): The size of the attention hidden layer.
                Defaults to hidden_size if not provided.
            intermediate_size (int, optional): The size of the intermediate layer. Defaults to 4 times hidden_size
                if not provided.
            layer_norm_epsilon (float): The epsilon value for layer normalization. Default is 1e-05.
            bos_token_id (int): The beginning of sentence token id. Default is 0.
            eos_token_id (int): The end of sentence token id. Default is 0.
            rescale_every (int): The frequency of rescaling. Default is 6.
            tie_word_embeddings (bool): Whether to tie word embeddings. Default is False.
            use_cache (bool): Whether to use cache. Default is True.

        Returns:
            None.

        Raises:
            ValueError: If the provided vocab_size, context_length, hidden_size, num_hidden_layers,
                attention_hidden_size, intermediate_size, layer_norm_epsilon, bos_token_id, eos_token_id,
                or rescale_every is not a positive integer.
            TypeError: If any of the provided parameters has an unexpected type.
        """
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size
        self.layer_norm_epsilon = layer_norm_epsilon
        self.rescale_every = rescale_every
        self.use_cache = use_cache

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(
            tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
        )

mindnlp.transformers.models.rwkv.configuration_rwkv.RwkvConfig.__init__(vocab_size=50277, context_length=1024, hidden_size=4096, num_hidden_layers=32, attention_hidden_size=None, intermediate_size=None, layer_norm_epsilon=1e-05, bos_token_id=0, eos_token_id=0, rescale_every=6, tie_word_embeddings=False, use_cache=True, **kwargs)

Initializes an instance of RwkvConfig.

PARAMETER DESCRIPTION
self

The instance itself.

vocab_size

The size of the vocabulary. Default is 50277.

TYPE: int DEFAULT: 50277

context_length

The length of the context. Default is 1024.

TYPE: int DEFAULT: 1024

hidden_size

The size of the hidden layers. Default is 4096.

TYPE: int DEFAULT: 4096

num_hidden_layers

The number of hidden layers. Default is 32.

TYPE: int DEFAULT: 32

attention_hidden_size

The size of the attention hidden layer. Defaults to hidden_size if not provided.

TYPE: int DEFAULT: None

intermediate_size

The size of the intermediate layer. Defaults to 4 times hidden_size if not provided.

TYPE: int DEFAULT: None

layer_norm_epsilon

The epsilon value for layer normalization. Default is 1e-05.

TYPE: float DEFAULT: 1e-05

bos_token_id

The beginning of sentence token id. Default is 0.

TYPE: int DEFAULT: 0

eos_token_id

The end of sentence token id. Default is 0.

TYPE: int DEFAULT: 0

rescale_every

The frequency of rescaling. Default is 6.

TYPE: int DEFAULT: 6

tie_word_embeddings

Whether to tie word embeddings. Default is False.

TYPE: bool DEFAULT: False

use_cache

Whether to use cache. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the provided vocab_size, context_length, hidden_size, num_hidden_layers, attention_hidden_size, intermediate_size, layer_norm_epsilon, bos_token_id, eos_token_id, or rescale_every is not a positive integer.

TypeError

If any of the provided parameters has an unexpected type.

Source code in mindnlp\transformers\models\rwkv\configuration_rwkv.py
def __init__(
    self,
    vocab_size=50277,
    context_length=1024,
    hidden_size=4096,
    num_hidden_layers=32,
    attention_hidden_size=None,
    intermediate_size=None,
    layer_norm_epsilon=1e-5,
    bos_token_id=0,
    eos_token_id=0,
    rescale_every=6,
    tie_word_embeddings=False,
    use_cache=True,
    **kwargs,
):
    """
    Initializes an instance of RwkvConfig.

    Args:
        self: The instance itself.
        vocab_size (int): The size of the vocabulary. Default is 50277.
        context_length (int): The length of the context. Default is 1024.
        hidden_size (int): The size of the hidden layers. Default is 4096.
        num_hidden_layers (int): The number of hidden layers. Default is 32.
        attention_hidden_size (int, optional): The size of the attention hidden layer.
            Defaults to hidden_size if not provided.
        intermediate_size (int, optional): The size of the intermediate layer. Defaults to 4 times hidden_size
            if not provided.
        layer_norm_epsilon (float): The epsilon value for layer normalization. Default is 1e-05.
        bos_token_id (int): The beginning of sentence token id. Default is 0.
        eos_token_id (int): The end of sentence token id. Default is 0.
        rescale_every (int): The frequency of rescaling. Default is 6.
        tie_word_embeddings (bool): Whether to tie word embeddings. Default is False.
        use_cache (bool): Whether to use cache. Default is True.

    Returns:
        None.

    Raises:
        ValueError: If the provided vocab_size, context_length, hidden_size, num_hidden_layers,
            attention_hidden_size, intermediate_size, layer_norm_epsilon, bos_token_id, eos_token_id,
            or rescale_every is not a positive integer.
        TypeError: If any of the provided parameters has an unexpected type.
    """
    self.vocab_size = vocab_size
    self.context_length = context_length
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
    self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size
    self.layer_norm_epsilon = layer_norm_epsilon
    self.rescale_every = rescale_every
    self.use_cache = use_cache

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    super().__init__(
        tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
    )
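The two optional sizes fall back to values derived from `hidden_size`. As a minimal sketch of that fallback logic (a hypothetical `resolve_sizes` helper mirroring the `__init__` above, not part of mindnlp):

```python
def resolve_sizes(hidden_size, attention_hidden_size=None, intermediate_size=None):
    """Mirror RwkvConfig's fallbacks: attention defaults to hidden_size,
    the feed-forward inner dimension to 4 * hidden_size."""
    attn = attention_hidden_size if attention_hidden_size is not None else hidden_size
    inter = intermediate_size if intermediate_size is not None else 4 * hidden_size
    return attn, inter

print(resolve_sizes(4096))  # → (4096, 16384)
```

Passing either argument explicitly overrides the derived default independently of the other, so e.g. a smaller attention width can be combined with the standard 4x feed-forward expansion.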