# lightning.pytorch==2.5.2
seed_everything: 42
trainer:
  accelerator: auto
  strategy: auto
  devices: auto
  num_nodes: 1
  precision: bf16-mixed
  callbacks:
  - class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      dirpath: null
      filename: null
      monitor: null
      verbose: false
      save_last: null
      save_top_k: 1
      save_weights_only: false
      mode: min
      auto_insert_metric_name: true
      every_n_train_steps: null
      train_time_interval: null
      every_n_epochs: null
      save_on_train_epoch_end: null
      enable_version_counter: true
  fast_dev_run: false
  max_epochs: null
  min_epochs: null
  max_steps: 200000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 50000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: null
  enable_checkpointing: null
  enable_progress_bar: false
  enable_model_summary: null
  accumulate_grad_batches: 2
  gradient_clip_val: 1
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
  model_registry: null
model:
  class_path: tite.module.TiteModule
  init_args:
    model:
      class_path: tite.model.TiteForPreTraining
      init_args:
        config:
          class_path: tite.model.TiteConfig
          init_args:
            vocab_size: 30522
            num_hidden_layers: 12
            hidden_sizes:
            - 768
            - 768
            - 768
            - 1024
            - 1024
            - 1024
            - 1280
            - 1280
            - 1280
            - 1536
            - 1536
            - 1536
            num_attention_heads:
            - 12
            - 12
            - 12
            - 16
            - 16
            - 16
            - 20
            - 20
            - 20
            - 24
            - 24
            - 24
            intermediate_sizes:
            - 3072
            - 3072
            - 3072
            - 4096
            - 4096
            - 4096
            - 5120
            - 5120
            - 5120
            - 6144
            - 6144
            - 6144
            kernel_sizes:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            strides:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            dropout_prob: 0.1
            max_position_embeddings: 512
            initializer_range: 0.02
            layer_norm_eps: 1.0e-12
            pad_token_id: 0
            hidden_act: gelu_pytorch_tanh
            absolute_positional_embedding_type: null
            relative_positional_embedding_type: rotary
            pooling_location: intra
            rotary_interleaved: true
            norm_location: post
            norm_type: layer
            pooling_implementation: triton
            rope_implementation: eager
            positional_embedding_type: null
        enhanced_masked_auto_encoding: true
        bow_auto_encoding: true
    tokenizer:
      class_path: tite.model.TiteTokenizer
      init_args:
        vocab_file: tokenizers/tite/vocab.txt
        tokenizer_file: tokenizers/tite/tokenizer.json
        do_lower_case: true
        unk_token: '[UNK]'
        sep_token: '[SEP]'
        pad_token: '[PAD]'
        cls_token: '[CLS]'
        mask_token: '[MASK]'
        tokenize_chinese_chars: true
        strip_accents: null
      dict_kwargs:
        model_max_length: 512
    validate_on_glue: true
    validate_on_trec_dl: true
    log_gradients: false
    compile: true
data:
  class_path: tite.datasets.FineWebDataModule
  init_args:
    collator:
      class_path: tite.datasets.TransformationCollator
      init_args:
        text_keys:
        - text
        - null
        string_transformations: null
        token_transformations:
        - class_path: tite.transformation.TokenMask
          init_args:
            mask_id: 103
            mask_prob: 0.3
        transformation_prob: 1.0
        max_length: 512
    path: HuggingFaceFW/fineweb-edu
    batch_size: 128
    seed: null
    num_workers: 8
    streaming: true
lr_scheduler:
  class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
  init_args:
    num_warmup_steps: 3000
    final_value: 0.02
    num_delay_steps: 0
optimizer:
  class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
  init_args:
    lr: 0.0001
    betas:
    - 0.9
    - 0.999
    eps: 1.0e-08
    weight_decay: 0.01
    amsgrad: false
    maximize: false
    foreach: null
    capturable: false
    differentiable: false
    fused: null
ckpt_path: null
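# Example launch (a minimal sketch): this file follows the LightningCLI dump format,
# so it can typically be replayed through the project's CLI entry point. The script
# name `main.py` and the config filename below are assumptions, not part of this dump:
#   python main.py fit --config pretrain.yaml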