ngpus: 4
type: aligned
gradient_accumulation_steps: 2

tokenizer:
  tokens: 50257
  model: gpt2

training:
  batch_size: 128
  accum: ${gradient_accumulation_steps}
  n_iters: 1250000
  snapshot_freq: 10000
  log_freq: 500
  eval_freq: 10000
  snapshot_freq_for_preemption: 3000
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
  loss_type: hybrid
  epsilon: 0.0
  lambda: 0.0

data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false

graph:
  type: absorb
  gamma: 1.0
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true

noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}

sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9

annealing:
  type: none
  efficient: false
  width: 1024
  tau: 1024
  eval_tau: 1024
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: sdlm
  diffusion_loss_weight: 1.0
  ce_loss_weight: 1.0
  sampling_eps: 0.0001
  attention:
    context_type: block_causal
    block_type: full
    match_inference: false

eval:
  batch_size: 16
  perplexity: true
  perplexity_batch_size: 8

optim:
  weight_decay: 0.1
  optimizer: AdamW
  lr: 0.0002
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: cosine

experiment:
  name: MDLM
  wandb_project: Hybrid-SDLM-ALIGNED

model:
  name: HDLM
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  dropout: 0.1
  scale_by_sigma: false
  transformer_sigma_conditioning: false
  hybrid_sigma_embedding: false
  post_process_logits: false
  use_timestep_embedding: false
  model_type: epsilon_hybrid
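
# The `${...}` values above look like OmegaConf-style interpolations (e.g. training.accum
# mirrors the top-level gradient_accumulation_steps). Assuming the config is consumed via
# OmegaConf and saved as config.yaml (both the loader and the filename are illustrative
# assumptions, not confirmed by this repo), a minimal loading sketch would be:
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("config.yaml")   # parse the YAML into a DictConfig
#   OmegaConf.resolve(cfg)                # resolve ${...} interpolations in place
#   assert cfg.training.accum == cfg.gradient_accumulation_steps
#   assert cfg.noise.expanded_sigma == cfg.graph.expanded_sigma
#   print(OmegaConf.to_yaml(cfg))         # dump the fully resolved config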