Upload HDLM model with complete HF integration
config.yaml ADDED +90 -0

@@ -0,0 +1,90 @@
+ngpus: 4
+type: aligned
+gradient_accumulation_steps: 2
+tokenizer:
+  tokens: 50257
+  model: gpt2
+training:
+  batch_size: 128
+  accum: ${gradient_accumulation_steps}
+  n_iters: 1250000
+  snapshot_freq: 10000
+  log_freq: 500
+  eval_freq: 10000
+  snapshot_freq_for_preemption: 3000
+  snapshot_sampling: true
+  ema: 0.9999
+  warmup_iter: -1
+  loss_type: hybrid
+  epsilon: 0.0
+  lambda: 0.0
+data:
+  train: openwebtext-train
+  valid: wikitext103
+  cache_dir: /home/toolkit/research-diffcodegen/data
+  debug: false
+graph:
+  type: absorb
+  gamma: 1.0
+  file: /home/toolkit/research-diffcodegen/data
+  report_all: false
+  expanded_sigma: true
+noise:
+  type: loglinear
+  sigma_min: 0.0001
+  sigma_max: 2.0
+  ar_diffusion: false
+  expanded_sigma: ${graph.expanded_sigma}
+sampling:
+  predictor: analytic
+  steps_per_level: 1
+  noise_removal: true
+  strategy: direct
+  strategy_param: 0.9
+annealing:
+  type: none
+  efficient: false
+  width: 1024
+  tau: 1024
+  eval_tau: 1024
+  steps_per_level: ${sampling.steps_per_level}
+  sampling_method: sdlm
+  diffusion_loss_weight: 1.0
+  ce_loss_weight: 1.0
+  sampling_eps: 0.0001
+  attention:
+    context_type: block_causal
+    block_type: full
+  match_inference: false
+eval:
+  batch_size: 16
+  perplexity: true
+  perplexity_batch_size: 8
+optim:
+  weight_decay: 0.1
+  optimizer: AdamW
+  lr: 0.0002
+  beta1: 0.9
+  beta2: 0.95
+  eps: 1.0e-08
+  warmup: 10000
+  grad_clip: 1.0
+  scheduler: cosine
+experiment:
+  name: MDLM
+  wandb_project: Hybrid-SDLM-ALIGNED
+model:
+  name: HDLM
+  type: ddit
+  hidden_size: 768
+  cond_dim: 128
+  length: 1024
+  n_blocks: 12
+  n_heads: 12
+  dropout: 0.1
+  scale_by_sigma: false
+  transformer_sigma_conditioning: false
+  hybrid_sigma_embedding: false
+  post_process_logits: false
+  use_timestep_embedding: false
+model_type: epsilon_hybrid
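Note on the ${...} values: `accum: ${gradient_accumulation_steps}` and `expanded_sigma: ${graph.expanded_sigma}` are OmegaConf-style interpolations, which suggests the file is consumed through OmegaConf (or Hydra) rather than plain PyYAML. A minimal sketch of loading and resolving it, assuming OmegaConf as the loader (the field names come from the file above; the loader choice and the per-step batch semantics are assumptions, not part of this commit):

    # Sketch: load config.yaml and resolve ${...} interpolations.
    # Assumes OmegaConf (pip install omegaconf); the HDLM training code
    # itself may load this file differently.
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("config.yaml")
    OmegaConf.resolve(cfg)  # replace interpolations with concrete values in place

    assert cfg.training.accum == cfg.gradient_accumulation_steps  # 2
    assert cfg.noise.expanded_sigma == cfg.graph.expanded_sigma   # true

    # Effective tokens per optimizer update, assuming training.batch_size
    # counts sequences per micro-step: 128 * 2 * 1024 = 262144.
    print(cfg.training.batch_size * cfg.training.accum * cfg.model.length)

The interpolations keep duplicated settings in sync: changing `gradient_accumulation_steps` at the top level automatically updates `training.accum` without editing two places.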
