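# Lightning CLI configuration for TITE pre-training (tite.model.TiteForPreTraining).
# Top-level sections: trainer, model, data, lr_scheduler, optimizer.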
seed_everything: 42
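# Trainer: bf16 mixed precision for 200k optimizer steps, validating every 50k
# training batches. With accumulate_grad_batches: 2 and the per-device batch
# size of 128 set in the data section below, the effective batch size is 256
# per device.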
trainer:
  accelerator: auto
  strategy: auto
  devices: auto
  num_nodes: 1
  precision: bf16-mixed
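  # With monitor: null, ModelCheckpoint tracks no metric and simply keeps the
  # most recent checkpoint (save_top_k: 1).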
  callbacks:
  - class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      dirpath: null
      filename: null
      monitor: null
      verbose: false
      save_last: null
      save_top_k: 1
      save_weights_only: false
      mode: min
      auto_insert_metric_name: true
      every_n_train_steps: null
      train_time_interval: null
      every_n_epochs: null
      save_on_train_epoch_end: null
      enable_version_counter: true
  fast_dev_run: false
  max_epochs: null
  min_epochs: null
  max_steps: 200000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 50000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: null
  enable_checkpointing: null
  enable_progress_bar: false
  enable_model_summary: null
  accumulate_grad_batches: 2
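  # gradient_clip_algorithm: null falls back to Lightning's default, which
  # clips gradients by norm.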
  gradient_clip_val: 1
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model_registry: null
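# Model: a 12-layer TITE encoder whose width grows in four 3-layer stages
# (hidden size 768 -> 1024 -> 1280 -> 1536). Head counts grow in step
# (12 -> 24), keeping the per-head dimension at 64, and the intermediate (FFN)
# size stays at 4x the hidden size throughout.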
model:
  class_path: tite.module.TiteModule
  init_args:
    model:
      class_path: tite.model.TiteForPreTraining
      init_args:
        config:
          class_path: tite.model.TiteConfig
          init_args:
            vocab_size: 30522
            num_hidden_layers: 12
            hidden_sizes:
            - 768
            - 768
            - 768
            - 1024
            - 1024
            - 1024
            - 1280
            - 1280
            - 1280
            - 1536
            - 1536
            - 1536
            num_attention_heads:
            - 12
            - 12
            - 12
            - 16
            - 16
            - 16
            - 20
            - 20
            - 20
            - 24
            - 24
            - 24
            intermediate_sizes:
            - 3072
            - 3072
            - 3072
            - 4096
            - 4096
            - 4096
            - 5120
            - 5120
            - 5120
            - 6144
            - 6144
            - 6144
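            # Layers 1-3 keep the full sequence (kernel/stride null); each of
            # layers 4-12 pools with kernel size 2 and stride 2. Assuming each
            # stride-2 pooling halves the sequence, a 512-token input shrinks
            # to a single vector after the final layer (512 / 2^9 = 1).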
            kernel_sizes:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            strides:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            dropout_prob: 0.1
            max_position_embeddings: 512
            initializer_range: 0.02
            layer_norm_eps: 1.0e-12
            pad_token_id: 0
            hidden_act: gelu_pytorch_tanh
            absolute_positional_embedding_type: null
            relative_positional_embedding_type: rotary
            pooling_location: intra
            rotary_interleaved: true
            norm_location: post
            norm_type: layer
            pooling_implementation: triton
            rope_implementation: eager
            positional_embedding_type: null
            enhanced_masked_auto_encoding: true
            bow_auto_encoding: true
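    # BERT-style uncased WordPiece tokenizer; vocab_size: 30522 above and
    # mask_id: 103 in the collator below match the bert-base-uncased vocabulary.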
    tokenizer:
      class_path: tite.model.TiteTokenizer
      init_args:
        vocab_file: tokenizers/tite/vocab.txt
        tokenizer_file: tokenizers/tite/tokenizer.json
        do_lower_case: true
        unk_token: '[UNK]'
        sep_token: '[SEP]'
        pad_token: '[PAD]'
        cls_token: '[CLS]'
        mask_token: '[MASK]'
        tokenize_chinese_chars: true
        strip_accents: null
      dict_kwargs:
        model_max_length: 512
    validate_on_glue: true
    validate_on_trec_dl: true
    log_gradients: false
    compile: true
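# Data: FineWeb-Edu streamed from the Hugging Face Hub; the collator's
# TokenMask transformation masks 30% of input tokens for the masked
# auto-encoding objectives enabled in the model config above.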
data:
  class_path: tite.datasets.FineWebDataModule
  init_args:
    collator:
      class_path: tite.datasets.TransformationCollator
      init_args:
        text_keys:
        - text
        - null
        string_transformations: null
        token_transformations:
        - class_path: tite.transformation.TokenMask
          init_args:
            mask_id: 103
            mask_prob: 0.3
            transformation_prob: 1.0
        max_length: 512
    path: HuggingFaceFW/fineweb-edu
    batch_size: 128
    seed: null
    num_workers: 8
    streaming: true
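# LR schedule: 3000 steps of linear warmup, then a sigmoid decay;
# final_value: 0.02 presumably fixes the decay floor at 2% of the peak rate.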
lr_scheduler:
  class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
  init_args:
    num_warmup_steps: 3000
    final_value: 0.02
    num_delay_steps: 0
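# Optimizer: an AdamW variant that, as its class name suggests, exempts bias
# and normalization parameters from weight decay.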
optimizer:
  class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
  init_args:
    lr: 0.0001
    betas:
    - 0.9
    - 0.999
    eps: 1.0e-08
    weight_decay: 0.01
    amsgrad: false
    maximize: false
    foreach: null
    capturable: false
    differentiable: false
    fused: null
ckpt_path: null
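# Hypothetical usage, assuming the project exposes a standard LightningCLI
# entry point (the command name is an assumption, not taken from this file):
#   python -m tite fit --config pretrain.yaml
# Resuming from a checkpoint would override ckpt_path on the command line:
#   python -m tite fit --config pretrain.yaml --ckpt_path <path/to.ckpt>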