# @package __global__

defaults:
  - _self_
  - /model/lm/model_scale: base # prefer this group to set the model scale instead of editing transformer_lm keys directly (see the sketch below)
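
# A minimal sketch (an assumption, not part of the original file): with
# Hydra-style config groups, a hypothetical /model/lm/model_scale/medium.yaml
# would gather the scale-related transformer_lm keys in one place, e.g.:
#
#   # @package __global__
#   transformer_lm:
#     dim: 1024
#     num_heads: 16
#     num_layers: 24
#
# Selecting such a file through the defaults list above keeps a scale change
# atomic instead of scattering edits across individual keys.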

lm_model: transformer_lm

codebooks_pattern:
  modeling: parallel

transformer_lm:
  dim: 512
  num_heads: 8
  num_layers: 8
  hidden_scale: 4
  n_q: 8 # number of codebook streams to model
  card: 1024 # cardinality of each codebook
  dropout: 0.
  emb_lr: null # optional dedicated learning rate for the embeddings
  activation: gelu
  norm_first: false # use pre-norm instead of post-norm
  bias_ff: true # use bias for the feedforward
  bias_attn: true # use bias for the attention
  bias_proj: true # use bias for the output projections
  past_context: null
  causal: true
  custom: false # use custom MHA implementation
  memory_efficient: false # use flash attention
  attention_as_float32: false # use float32 for the attention part,
                              # recommended at the moment when memory_efficient is True.
  layer_scale: null
  positional_embedding: sin # positional embedding strategy (sin, rope, or sin_rope).
  xpos: false # apply xpos decay (rope only).
  checkpointing: none # layer checkpointing method, can be none, torch, xformers_default.
                      # torch is the slowest but uses the least memory,
                      # xformers_default is somewhere in between.
  weight_init: null # weight initialization (null, gaussian or uniform)
  depthwise_init: null # perform depthwise initialization (null, current, global)
  zero_bias_init: false # initialize biases to zero; applies only when the linears
                        # have a bias and a weight_init method is used.
  norm: layer_norm # normalization method to use in the transformer.
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
  two_step_cfg: false # whether to run true two-step classifier-free guidance (CFG),
                      # potentially resolving some padding issues.
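
# Usage note (a hedged sketch, not from this file): with Hydra-style group
# overrides, the model scale can typically be swapped at launch time rather
# than by editing transformer_lm keys here, along the lines of:
#   <train command> model/lm/model_scale=medium
# The train command and the `medium` scale file are assumptions about the
# surrounding training setup.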