# lightning.pytorch==2.5.2
seed_everything: 42
trainer:
  accelerator: auto
  strategy: auto
  devices: auto
  num_nodes: 1
  precision: bf16-mixed
  callbacks:
  - class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      dirpath: null
      filename: null
      monitor: null
      verbose: false
      save_last: null
      save_top_k: 1
      save_weights_only: false
      mode: min
      auto_insert_metric_name: true
      every_n_train_steps: null
      train_time_interval: null
      every_n_epochs: null
      save_on_train_epoch_end: null
      enable_version_counter: true
  fast_dev_run: false
  max_epochs: null
  min_epochs: null
  max_steps: 200000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 50000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: null
  enable_checkpointing: null
  enable_progress_bar: false
  enable_model_summary: null
  accumulate_grad_batches: 2
  gradient_clip_val: 1
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
  model_registry: null
model:
  class_path: tite.module.TiteModule
  init_args:
    model:
      class_path: tite.model.TiteForPreTraining
      init_args:
        config:
          class_path: tite.model.TiteConfig
          init_args:
            vocab_size: 30522
            num_hidden_layers: 12
            hidden_sizes:
            - 768
            - 768
            - 768
            - 1024
            - 1024
            - 1024
            - 1280
            - 1280
            - 1280
            - 1536
            - 1536
            - 1536
            num_attention_heads:
            - 12
            - 12
            - 12
            - 16
            - 16
            - 16
            - 20
            - 20
            - 20
            - 24
            - 24
            - 24
            intermediate_sizes:
            - 3072
            - 3072
            - 3072
            - 4096
            - 4096
            - 4096
            - 5120
            - 5120
            - 5120
            - 6144
            - 6144
            - 6144
            kernel_sizes:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            strides:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            dropout_prob: 0.1
            max_position_embeddings: 512
            initializer_range: 0.02
            layer_norm_eps: 1.0e-12
            pad_token_id: 0
            hidden_act: gelu_pytorch_tanh
            absolute_positional_embedding_type: null
            relative_positional_embedding_type: rotary
            pooling_location: intra
            rotary_interleaved: true
            norm_location: post
            norm_type: layer
            pooling_implementation: triton
            rope_implementation: eager
            positional_embedding_type: null
        enhanced_masked_auto_encoding: true
        bow_auto_encoding: true
    tokenizer:
      class_path: tite.model.TiteTokenizer
      init_args:
        vocab_file: tokenizers/tite/vocab.txt
        tokenizer_file: tokenizers/tite/tokenizer.json
        do_lower_case: true
        unk_token: '[UNK]'
        sep_token: '[SEP]'
        pad_token: '[PAD]'
        cls_token: '[CLS]'
        mask_token: '[MASK]'
        tokenize_chinese_chars: true
        strip_accents: null
      dict_kwargs:
        model_max_length: 512
    validate_on_glue: true
    validate_on_trec_dl: true
    log_gradients: false
    compile: true
data:
  class_path: tite.datasets.FineWebDataModule
  init_args:
    collator:
      class_path: tite.datasets.TransformationCollator
      init_args:
        text_keys:
        - text
        - null
        string_transformations: null
        token_transformations:
        - class_path: tite.transformation.TokenMask
          init_args:
            mask_id: 103
            mask_prob: 0.3
        transformation_prob: 1.0
        max_length: 512
    path: HuggingFaceFW/fineweb-edu
    batch_size: 128
    seed: null
    num_workers: 8
    streaming: true
lr_scheduler:
  class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
  init_args:
    num_warmup_steps: 3000
    final_value: 0.02
    num_delay_steps: 0
optimizer:
  class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
  init_args:
    lr: 0.0001
    betas:
    - 0.9
    - 0.999
    eps: 1.0e-08
    weight_decay: 0.01
    amsgrad: false
    maximize: false
    foreach: null
    capturable: false
    differentiable: false
    fused: null
ckpt_path: null
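# Example launch (a minimal sketch): this file follows the LightningCLI dump format,
# so it can typically be replayed through the project's CLI entry point. The script
# name `main.py` and the config filename below are assumptions, not part of this dump:
#   python main.py fit --config pretrain.yaml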