# tite-2-late-upscale / pl_config.yaml
# lightning.pytorch==2.5.2
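# LightningCLI config dump. Assuming the repository exposes a standard
# LightningCLI entry point (script name assumed for illustration), training
# would be launched with:
#   python main.py fit --config pl_config.yaml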
seed_everything: 42
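# Trainer: bf16 mixed precision, 200k optimizer steps, validation every 50k
# training batches; with accumulate_grad_batches: 2 and batch_size: 128 (see
# the data section), the effective batch size is 256 per device.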
trainer:
  accelerator: auto
  strategy: auto
  devices: auto
  num_nodes: 1
  precision: bf16-mixed
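  # With monitor: null, ModelCheckpoint does not track a metric; it simply
  # keeps the most recent checkpoint (save_top_k: 1), so mode: min is inert.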
  callbacks:
  - class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      dirpath: null
      filename: null
      monitor: null
      verbose: false
      save_last: null
      save_top_k: 1
      save_weights_only: false
      mode: min
      auto_insert_metric_name: true
      every_n_train_steps: null
      train_time_interval: null
      every_n_epochs: null
      save_on_train_epoch_end: null
      enable_version_counter: true
  fast_dev_run: false
  max_epochs: null
  min_epochs: null
  max_steps: 200000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 50000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: null
  enable_checkpointing: null
  enable_progress_bar: false
  enable_model_summary: null
  accumulate_grad_batches: 2
  gradient_clip_val: 1
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
  model_registry: null
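# Model: TiteModule wrapping TiteForPreTraining. Twelve encoder layers widen
# in blocks of three (768 -> 1024 -> 1280 -> 1536 hidden size); from layer 4
# onward, each layer pools the sequence with kernel size 2 and stride 2,
# halving its length.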
model:
  class_path: tite.module.TiteModule
  init_args:
    model:
      class_path: tite.model.TiteForPreTraining
      init_args:
        config:
          class_path: tite.model.TiteConfig
          init_args:
            vocab_size: 30522
            num_hidden_layers: 12
            hidden_sizes:
            - 768
            - 768
            - 768
            - 1024
            - 1024
            - 1024
            - 1280
            - 1280
            - 1280
            - 1536
            - 1536
            - 1536
            num_attention_heads:
            - 12
            - 12
            - 12
            - 16
            - 16
            - 16
            - 20
            - 20
            - 20
            - 24
            - 24
            - 24
            intermediate_sizes:
            - 3072
            - 3072
            - 3072
            - 4096
            - 4096
            - 4096
            - 5120
            - 5120
            - 5120
            - 6144
            - 6144
            - 6144
            kernel_sizes:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            strides:
            - null
            - null
            - null
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            - 2
            dropout_prob: 0.1
            max_position_embeddings: 512
            initializer_range: 0.02
            layer_norm_eps: 1.0e-12
            pad_token_id: 0
            hidden_act: gelu_pytorch_tanh
            absolute_positional_embedding_type: null
            relative_positional_embedding_type: rotary
            pooling_location: intra
            rotary_interleaved: true
            norm_location: post
            norm_type: layer
            pooling_implementation: triton
            rope_implementation: eager
            positional_embedding_type: null
            enhanced_masked_auto_encoding: true
            bow_auto_encoding: true
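    # Tokenizer: BERT-style uncased WordPiece; vocab_size 30522 above matches
    # bert-base-uncased, where token id 103 (used as mask_id below) is [MASK].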
    tokenizer:
      class_path: tite.model.TiteTokenizer
      init_args:
        vocab_file: tokenizers/tite/vocab.txt
        tokenizer_file: tokenizers/tite/tokenizer.json
        do_lower_case: true
        unk_token: '[UNK]'
        sep_token: '[SEP]'
        pad_token: '[PAD]'
        cls_token: '[CLS]'
        mask_token: '[MASK]'
        tokenize_chinese_chars: true
        strip_accents: null
      dict_kwargs:
        model_max_length: 512
    validate_on_glue: true
    validate_on_trec_dl: true
    log_gradients: false
    compile: true
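# Data: FineWeb-Edu streamed from the Hugging Face Hub; the collator masks
# 30% of tokens for the masked auto-encoding objective.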
data:
  class_path: tite.datasets.FineWebDataModule
  init_args:
    collator:
      class_path: tite.datasets.TransformationCollator
      init_args:
        text_keys:
        - text
        - null
        string_transformations: null
        token_transformations:
        - class_path: tite.transformation.TokenMask
          init_args:
            mask_id: 103
            mask_prob: 0.3
            transformation_prob: 1.0
        max_length: 512
    path: HuggingFaceFW/fineweb-edu
    batch_size: 128
    seed: null
    num_workers: 8
    streaming: true
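# LR schedule: 3k-step linear warmup, then sigmoid decay; final_value: 0.02 is
# presumably the floor as a fraction of the peak learning rate (interpretation
# assumed from the class name).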
lr_scheduler:
  class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
  init_args:
    num_warmup_steps: 3000
    final_value: 0.02
    num_delay_steps: 0
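# Optimizer: AdamW variant that, judging by the class name, exempts bias and
# normalization parameters from weight decay.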
optimizer:
  class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
  init_args:
    lr: 0.0001
    betas:
    - 0.9
    - 0.999
    eps: 1.0e-08
    weight_decay: 0.01
    amsgrad: false
    maximize: false
    foreach: null
    capturable: false
    differentiable: false
    fused: null
ckpt_path: null