from typing import Dict, List, Optional

from transformers.configuration_utils import PretrainedConfig


class ReneConfig(PretrainedConfig):
    r"""Configuration class for the Rene model.

    This is the configuration class to store the configuration of a [`ReneLMHeadModel`]. It is used to instantiate a
    Rene model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of the
    [cartesia-ai/Rene-v0.1-1.3b-pytorch](https://huggingface.co/cartesia-ai/Rene-v0.1-1.3b-pytorch) model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        n_layer (`int`, *optional*, defaults to 48):
            Number of architecture blocks.
        vocab_size (`int`, *optional*, defaults to 50280):
            Vocabulary size of the Rene model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`ReneModel`].
        ssm_cfg (`dict`, *optional*):
            Configuration parameters for the SSM layers.
        attn_layer_idx (`List[int]`, *optional*):
            Indices of the architecture blocks that should have attention layers.
        attn_cfg (`dict`, *optional*):
            Configuration parameters for the attention layers.
        mlp_layer_idx (`List[int]`, *optional*):
            Indices of the architecture blocks that should have MLP layers.
        mlp_cfg (`dict`, *optional*):
            Configuration parameters for the MLP layers.
        rms_norm (`bool`, *optional*, defaults to `True`):
            Whether to use RMSNorm (instead of LayerNorm).
        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
            Whether to keep residual values in fp32.
        pad_vocab_size_multiple (`int`, *optional*, defaults to 16):
            Pad the vocabulary size up to the next multiple of this value.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if
            the model has an output word embedding layer.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the padding token.
        bos_token_id (`int`, *optional*):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 50279):
            The id of the "end-of-sequence" token.
| """ | |

    model_type = "rene"

    def __init__(
        self,
        d_model: int = 2048,
        n_layer: int = 48,
        vocab_size: int = 50280,
        ssm_cfg: Optional[Dict] = None,
        attn_layer_idx: Optional[List[int]] = None,
        attn_cfg: Optional[Dict] = None,
        mlp_layer_idx: Optional[List[int]] = None,
        mlp_cfg: Optional[Dict] = None,
        rms_norm: bool = True,
        residual_in_fp32: bool = True,
        pad_vocab_size_multiple: int = 16,
        tie_word_embeddings: bool = True,
        pad_token_id: int = 1,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 50279,
        **kwargs,
    ):
        # Mutable container defaults use the None-sentinel pattern so that separate
        # ReneConfig instances never share the same dict/list objects.
        if ssm_cfg is None:
            ssm_cfg = {}
        if attn_layer_idx is None:
            attn_layer_idx = []
        if attn_cfg is None:
            attn_cfg = {}
        if mlp_layer_idx is None:
            mlp_layer_idx = []
        if mlp_cfg is None:
            mlp_cfg = {}

        self.d_model = d_model
        self.n_layer = n_layer
        self.vocab_size = vocab_size
        self.ssm_cfg = ssm_cfg
        self.attn_layer_idx = attn_layer_idx
        self.attn_cfg = attn_cfg
        self.mlp_layer_idx = mlp_layer_idx
        self.mlp_cfg = mlp_cfg
        self.rms_norm = rms_norm
        self.residual_in_fp32 = residual_in_fp32
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.tie_word_embeddings = tie_word_embeddings

        # Token ids and embedding tying are forwarded to the PretrainedConfig base class,
        # which handles them (and any extra kwargs) generically.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
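

if __name__ == "__main__":
    # Usage sketch (not part of the original module): round-trip the config through
    # PretrainedConfig's generic serialization API (`save_pretrained` / `from_pretrained`).
    # The directory name and the attn_layer_idx override are illustrative placeholders,
    # not values taken from the released Rene-v0.1-1.3b-pytorch checkpoint.
    demo_config = ReneConfig(attn_layer_idx=[12, 24, 36])
    demo_config.save_pretrained("rene-config-demo")  # writes rene-config-demo/config.json
    reloaded = ReneConfig.from_pretrained("rene-config-demo")
    assert reloaded.model_type == "rene"
    assert reloaded.attn_layer_idx == [12, 24, 36]
    print(reloaded.to_json_string())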