ngpus: 4
type: aligned
gradient_accumulation_steps: 2

tokenizer:
  tokens: 50257
  model: gpt2

training:
  batch_size: 128
  accum: ${gradient_accumulation_steps}
  n_iters: 1250000
  snapshot_freq: 10000
  log_freq: 500
  eval_freq: 10000
  snapshot_freq_for_preemption: 3000
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
  loss_type: hybrid
  epsilon: 0.0
  lambda: 0.0

data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false

graph:
  type: absorb
  gamma: 1.0
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true

noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}

sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9

annealing:
  type: none
  efficient: false
  width: 1024
  tau: 1024
  eval_tau: 1024
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: sdlm
  diffusion_loss_weight: 1.0
  ce_loss_weight: 1.0
  sampling_eps: 0.0001
  attention:
    context_type: block_causal
    block_type: full
    match_inference: false

eval:
  batch_size: 16
  perplexity: true
  perplexity_batch_size: 8

optim:
  weight_decay: 0.1
  optimizer: AdamW
  lr: 0.0002
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: cosine

experiment:
  name: MDLM
  wandb_project: Hybrid-SDLM-ALIGNED

model:
  name: HDLM
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  dropout: 0.1
  scale_by_sigma: false
  transformer_sigma_conditioning: false
  hybrid_sigma_embedding: false
  post_process_logits: false
  use_timestep_embedding: false
  model_type: epsilon_hybrid
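
# The `${...}` values above look like OmegaConf-style interpolations (e.g. training.accum
# mirrors the top-level gradient_accumulation_steps). Assuming the config is consumed via
# OmegaConf and saved as config.yaml (both the loader and the filename are illustrative
# assumptions, not confirmed by this repo), a minimal loading sketch would be:
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("config.yaml")   # parse the YAML into a DictConfig
#   OmegaConf.resolve(cfg)                # resolve ${...} interpolations in place
#   assert cfg.training.accum == cfg.gradient_accumulation_steps
#   assert cfg.noise.expanded_sigma == cfg.graph.expanded_sigma
#   print(OmegaConf.to_yaml(cfg))         # dump the fully resolved config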