Model save
- README.md +1 -1
- all_results.json +6 -6
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- train_results.json +6 -6
- trainer_state.json +361 -436
- training_args.bin +1 -1
    	
    README.md
    CHANGED

    @@ -26,7 +26,7 @@ print(output["generated_text"])
     
     ## Training procedure
     
    -[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/
    +[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/d55470xf)
     
     
     This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
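For readers who want to reproduce a run like the one linked above, here is a minimal sketch of DPO training with TRL's `DPOTrainer`. The base model name and the dataset are placeholders, not the actual setup behind this repository, and keyword names (e.g. `processing_class` vs. `tokenizer`) vary between TRL versions.

```python
# Minimal DPO sketch with TRL -- NOT the training script used for this repo.
# Placeholders/assumptions: base model name, preference dataset, hyperparameters.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

base_model = "meta-llama/Llama-3.1-8B-Instruct"   # placeholder base model
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Any dataset with "prompt", "chosen" and "rejected" columns works for DPOTrainer.
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

args = DPOConfig(
    output_dir="dpo-run",
    beta=0.1,                 # weight of the implicit KL penalty in the DPO loss
    num_train_epochs=2,       # matches "num_train_epochs" in trainer_state.json
    logging_steps=5,          # matches "logging_steps"
    save_steps=50,            # matches "save_steps"
    report_to="wandb",        # produces a W&B run like the badge above
)

trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    processing_class=tokenizer,   # older TRL versions call this argument "tokenizer"
)
trainer.train()
```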
    	
    all_results.json
    CHANGED

    @@ -1,9 +1,9 @@
     {
    -    "epoch": 1.
    +    "epoch": 1.9971305595408895,
         "total_flos": 0.0,
    -    "train_loss": 0.
    -    "train_runtime":
    -    "train_samples":
    -    "train_samples_per_second": 5.
    -    "train_steps_per_second": 0.
    +    "train_loss": 0.5929731760901966,
    +    "train_runtime": 2030.2211,
    +    "train_samples": 5576,
    +    "train_samples_per_second": 5.493,
    +    "train_steps_per_second": 0.086
     }
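As a quick sanity check (my own arithmetic, not part of the repository files), the two throughput fields follow directly from the runtime, the sample count, the epoch count and the step count reported elsewhere in this commit:

```python
# Reproduce the throughput fields of all_results.json (sanity check only).
train_runtime = 2030.2211    # seconds, from all_results.json
train_samples = 5576         # preference pairs per epoch
num_train_epochs = 2         # from trainer_state.json
global_step = 174            # from trainer_state.json

train_samples_per_second = train_samples * num_train_epochs / train_runtime
train_steps_per_second = global_step / train_runtime

print(round(train_samples_per_second, 3))  # 5.493
print(round(train_steps_per_second, 3))    # 0.086
```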
    	
    model-00001-of-00004.safetensors
    CHANGED

    @@ -1,3 +1,3 @@
     version https://git-lfs.github.com/spec/v1
    -oid sha256:
    +oid sha256:f1b2cfa01d7837330c9b890a79677233bde135efb7b9300dc70ca9c2436cfe2d
     size 4976698672
    	
    model-00002-of-00004.safetensors
    CHANGED

    @@ -1,3 +1,3 @@
     version https://git-lfs.github.com/spec/v1
    -oid sha256:
    +oid sha256:f1a9f875c8b82e62970bc18ae27be339c4fa9058b0cdb3f537fa10774bf479e9
     size 4999802720
    	
    model-00003-of-00004.safetensors
    CHANGED

    @@ -1,3 +1,3 @@
     version https://git-lfs.github.com/spec/v1
    -oid sha256:
    +oid sha256:f23e9719d839b39d06c0e5b2276ea0def1e8e6d6774413af2ae74d34fa9ac0a6
     size 4915916176
    	
    model-00004-of-00004.safetensors
    CHANGED

    @@ -1,3 +1,3 @@
     version https://git-lfs.github.com/spec/v1
    -oid sha256:
    +oid sha256:643861ba758160b88b3f43351ea8d2c440cf58ff16173e0337f35c74c90cf95b
     size 1168138808
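The four entries above are Git LFS pointer files: each records only the SHA-256 digest (`oid`) and byte size of the actual weight shard stored out of band. A small sketch (a workflow assumption, not a script from this repo) for checking a downloaded shard against its pointer:

```python
# Sketch: verify a downloaded shard against the Git LFS pointer shown above.
import hashlib
import os

def verify_lfs_object(path: str, expected_sha256: str, expected_size: int) -> bool:
    """Return True if the file matches the pointer's size and SHA-256 digest."""
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest() == expected_sha256

# Example with the pointer values of the first shard:
ok = verify_lfs_object(
    "model-00001-of-00004.safetensors",
    "f1b2cfa01d7837330c9b890a79677233bde135efb7b9300dc70ca9c2436cfe2d",
    4976698672,
)
print(ok)
```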
    	
    train_results.json
    CHANGED

    @@ -1,9 +1,9 @@
     {
    -    "epoch": 1.
    +    "epoch": 1.9971305595408895,
         "total_flos": 0.0,
    -    "train_loss": 0.
    -    "train_runtime":
    -    "train_samples":
    -    "train_samples_per_second": 5.
    -    "train_steps_per_second": 0.
    +    "train_loss": 0.5929731760901966,
    +    "train_runtime": 2030.2211,
    +    "train_samples": 5576,
    +    "train_samples_per_second": 5.493,
    +    "train_steps_per_second": 0.086
     }
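The `trainer_state.json` diff that follows logs DPO metrics (`loss`, `rewards/accuracies`, `rewards/margins`, ...) every 5 steps. A short sketch (assumed local filename, not part of the repo) for summarizing that log after download:

```python
# Summarize DPO training metrics from trainer_state.json (sketch; assumes the
# file has been downloaded next to this script).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step log entries (the final entry holds aggregate stats).
steps = [e for e in state["log_history"] if "rewards/margins" in e]

print("logged steps:", len(steps))
print("final reward margin:", steps[-1]["rewards/margins"])
print("final preference accuracy:", steps[-1]["rewards/accuracies"])
print("mean loss:", sum(e["loss"] for e in steps) / len(steps))
```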
    	
    trainer_state.json
    CHANGED

    @@ -1,610 +1,535 @@
     {
       "best_metric": null,
       "best_model_checkpoint": null,
    -  "epoch": 1.
       "eval_steps": 500,
    -  "global_step": 
       "is_hyper_param_search": false,
       "is_local_process_zero": true,
       "is_world_process_zero": true,
       "log_history": [
        ... [removed log entries for steps 5-170: same fields as below; numeric values truncated in this diff view, "logits/chosen"/"logits/rejected" logged as NaN throughout] ...
    -    {
    -      "epoch": 1.
    -      "grad_norm": 
    -      "learning_rate": 2.4468085106382976e-08,
    -      "logits/chosen": NaN,
    -      "logits/rejected": NaN,
    -      "logps/chosen": -350.2046813964844,
    -      "logps/rejected": -433.5687561035156,
    -      "loss": 0.4755,
    -      "rewards/accuracies": 0.8031250238418579,
    -      "rewards/chosen": -0.027071380987763405,
    -      "rewards/margins": 1.4462082386016846,
    -      "rewards/rejected": -1.4738037586212158,
    -      "step": 175
    -    },
    -    {
    -      "epoch": 1.8147448015122873,
    -      "grad_norm": 46.67757830253006,
    -      "learning_rate": 1.9148936170212764e-08,
    -      "logits/chosen": NaN,
    -      "logits/rejected": NaN,
    -      "logps/chosen": -398.32501220703125,
    -      "logps/rejected": -456.9437561035156,
    -      "loss": 0.4807,
    -      "rewards/accuracies": 0.746874988079071,
    -      "rewards/chosen": -0.05414886400103569,
    -      "rewards/margins": 1.2570632696151733,
    -      "rewards/rejected": -1.310980200767517,
    -      "step": 180
    -    },
    -    {
    -      "epoch": 1.865154379332073,
    -      "grad_norm": 43.636875496682755,
    -      "learning_rate": 1.3829787234042552e-08,
    -      "logits/chosen": NaN,
    -      "logits/rejected": -0.17527160048484802,
    -      "logps/chosen": -373.87188720703125,
    -      "logps/rejected": -535.796875,
    -      "loss": 0.4376,
    -      "rewards/accuracies": 0.796875,
    -      "rewards/chosen": -0.026004791259765625,
    -      "rewards/margins": 1.6229279041290283,
    -      "rewards/rejected": -1.649743676185608,
    -      "step": 185
    -    },
    -    {
    -      "epoch": 1.9155639571518588,
    -      "grad_norm": 41.753332033000945,
    -      "learning_rate": 8.510638297872339e-09,
    -      "logits/chosen": NaN,
    -      "logits/rejected": NaN,
    -      "logps/chosen": -360.0718688964844,
    -      "logps/rejected": -468.2124938964844,
    -      "loss": 0.4872,
    -      "rewards/accuracies": 0.731249988079071,
    -      "rewards/chosen": -0.11498375236988068,
    -      "rewards/margins": 1.2304840087890625,
    -      "rewards/rejected": -1.345800757408142,
    -      "step": 190
    -    },
    -    {
    -      "epoch": 1.9659735349716447,
    -      "grad_norm": 43.377545693326894,
    -      "learning_rate": 3.1914893617021273e-09,
    -      "logits/chosen": NaN,
    -      "logits/rejected": NaN,
    -      "logps/chosen": -365.515625,
    -      "logps/rejected": -478.56561279296875,
    -      "loss": 0.4805,
    -      "rewards/accuracies": 0.7718750238418579,
    -      "rewards/chosen": -0.04032173007726669,
    -      "rewards/margins": 1.3543853759765625,
    -      "rewards/rejected": -1.39520263671875,
    -      "step": 195
    -    },
    -    {
    -      "epoch": 1.996219281663516,
    -      "step": 198,
           "total_flos": 0.0,
    -      "train_loss": 0.
    -      "train_runtime": 
    -      "train_samples_per_second": 5.
    -      "train_steps_per_second": 0.
         }
       ],
       "logging_steps": 5,
    -  "max_steps": 
       "num_input_tokens_seen": 0,
       "num_train_epochs": 2,
       "save_steps": 50,

     {
       "best_metric": null,
       "best_model_checkpoint": null,
    +  "epoch": 1.9971305595408895,
       "eval_steps": 500,
    +  "global_step": 174,
       "is_hyper_param_search": false,
       "is_local_process_zero": true,
       "is_world_process_zero": true,
       "log_history": [
         {
    +      "epoch": 0.05738880918220947,
    +      "grad_norm": 147.0504219778771,
    +      "learning_rate": 1.1111111111111111e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -309.26251220703125,
    +      "logps/rejected": -410.8433532714844,
    +      "loss": 0.6899,
    +      "rewards/accuracies": 0.17499999701976776,
    +      "rewards/chosen": -0.002740192459896207,
    +      "rewards/margins": 0.0071624754928052425,
    +      "rewards/rejected": -0.00990285910665989,
           "step": 5
         },
         {
    +      "epoch": 0.11477761836441894,
    +      "grad_norm": 123.58633090508155,
    +      "learning_rate": 1.9878787878787876e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -277.09844970703125,
    +      "logps/rejected": -374.31719970703125,
    +      "loss": 0.6926,
    +      "rewards/accuracies": 0.2718749940395355,
    +      "rewards/chosen": 0.0009648323175497353,
    +      "rewards/margins": 0.00359344482421875,
    +      "rewards/rejected": -0.0026039122603833675,
           "step": 10
         },
         {
    +      "epoch": 0.17216642754662842,
    +      "grad_norm": 82.52640749761395,
    +      "learning_rate": 1.9272727272727272e-07,
           "logits/chosen": NaN,
    +      "logits/rejected": -0.355844110250473,
    +      "logps/chosen": -250.90859985351562,
    +      "logps/rejected": -288.7484436035156,
    +      "loss": 0.6679,
    +      "rewards/accuracies": 0.3656249940395355,
    +      "rewards/chosen": -0.018105220049619675,
    +      "rewards/margins": 0.07349129021167755,
    +      "rewards/rejected": -0.09159164130687714,
           "step": 15
         },
         {
    +      "epoch": 0.22955523672883787,
    +      "grad_norm": 46.811267261200435,
    +      "learning_rate": 1.8666666666666667e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -250.43203735351562,
    +      "logps/rejected": -299.9468688964844,
    +      "loss": 0.638,
    +      "rewards/accuracies": 0.4124999940395355,
    +      "rewards/chosen": -0.02562398836016655,
    +      "rewards/margins": 0.21783074736595154,
    +      "rewards/rejected": -0.24351105093955994,
           "step": 20
         },
         {
    +      "epoch": 0.28694404591104733,
    +      "grad_norm": 44.812641031283796,
    +      "learning_rate": 1.806060606060606e-07,
           "logits/chosen": NaN,
    +      "logits/rejected": -0.32989805936813354,
    +      "logps/chosen": -264.3968811035156,
    +      "logps/rejected": -420.49688720703125,
    +      "loss": 0.6218,
    +      "rewards/accuracies": 0.4625000059604645,
    +      "rewards/chosen": -0.05606970936059952,
    +      "rewards/margins": 0.3526493012905121,
    +      "rewards/rejected": -0.4085969924926758,
           "step": 25
         },
         {
    +      "epoch": 0.34433285509325684,
    +      "grad_norm": 41.09665846804977,
    +      "learning_rate": 1.7454545454545453e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -261.12969970703125,
    +      "logps/rejected": -333.25665283203125,
    +      "loss": 0.6433,
    +      "rewards/accuracies": 0.453125,
    +      "rewards/chosen": -0.05041093751788139,
    +      "rewards/margins": 0.3066027760505676,
    +      "rewards/rejected": -0.3569931983947754,
           "step": 30
         },
         {
    +      "epoch": 0.4017216642754663,
    +      "grad_norm": 47.66979279242685,
    +      "learning_rate": 1.6848484848484848e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -252.76406860351562,
    +      "logps/rejected": -320.32342529296875,
    +      "loss": 0.611,
    +      "rewards/accuracies": 0.546875,
    +      "rewards/chosen": -0.07408180087804794,
    +      "rewards/margins": 0.735063910484314,
    +      "rewards/rejected": -0.8097826838493347,
           "step": 35
         },
         {
    +      "epoch": 0.45911047345767575,
    +      "grad_norm": 44.453842871184655,
    +      "learning_rate": 1.624242424242424e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -282.52734375,
    +      "logps/rejected": -314.4585876464844,
    +      "loss": 0.6154,
    +      "rewards/accuracies": 0.550000011920929,
    +      "rewards/chosen": -0.08205080032348633,
    +      "rewards/margins": 0.6658231616020203,
    +      "rewards/rejected": -0.7472448348999023,
           "step": 40
         },
         {
    +      "epoch": 0.5164992826398852,
    +      "grad_norm": 72.40873860390725,
    +      "learning_rate": 1.5636363636363637e-07,
           "logits/chosen": NaN,
    +      "logits/rejected": NaN,
    +      "logps/chosen": -246.61483764648438,
    +      "logps/rejected": -324.8515625,
    +      "loss": 0.6541,
    +      "rewards/accuracies": 0.5562499761581421,
    +      "rewards/chosen": -0.14140835404396057,
    +      "rewards/margins": 0.6368468999862671,
    +      "rewards/rejected": -0.7777351140975952,
           "step": 45
         },
         {
    +      "epoch": 0.5738880918220947,
    +      "grad_norm": 51.33061405888651,
    +      "learning_rate": 1.503030303030303e-07,
           "logits/chosen": NaN,
    +      "logits/rejected": -0.29754638671875,
    +      "logps/chosen": -251.82656860351562,
    +      "logps/rejected": -364.16094970703125,
    +      "loss": 0.6061,
    +      "rewards/accuracies": 0.5687500238418579,
    +      "rewards/chosen": -0.097315214574337,
    +      "rewards/margins": 0.7471939325332642,
    +      "rewards/rejected": -0.8438205718994141,
           "step": 50
         },
         {
    +      "epoch": 0.6312769010043041,
    +      "grad_norm": 47.78546749711004,
    +      "learning_rate": 1.4424242424242422e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
    +      "logps/chosen": -229.6281280517578,
    +      "logps/rejected": -346.20623779296875,
    +      "loss": 0.5885,
    +      "rewards/accuracies": 0.596875011920929,
    +      "rewards/chosen": -0.08479080349206924,
    +      "rewards/margins": 0.9593955874443054,
    +      "rewards/rejected": -1.0437196493148804,
           "step": 55
         },
         {
    +      "epoch": 0.6886657101865137,
    +      "grad_norm": 32.88856633193689,
    +      "learning_rate": 1.3818181818181818e-07,
           "logits/chosen": NaN,
           "logits/rejected": NaN,
            +
                  "logps/chosen": -259.2515563964844,
         | 
| 183 | 
            +
                  "logps/rejected": -317.92498779296875,
         | 
| 184 | 
            +
                  "loss": 0.6109,
         | 
| 185 | 
            +
                  "rewards/accuracies": 0.640625,
         | 
| 186 | 
            +
                  "rewards/chosen": -0.09950466454029083,
         | 
| 187 | 
            +
                  "rewards/margins": 0.8476117849349976,
         | 
| 188 | 
            +
                  "rewards/rejected": -0.9464820623397827,
         | 
| 189 | 
             
                  "step": 60
         | 
| 190 | 
             
                },
         | 
| 191 | 
             
                {
         | 
| 192 | 
            +
                  "epoch": 0.7460545193687231,
         | 
| 193 | 
            +
                  "grad_norm": 46.246379021501845,
         | 
| 194 | 
            +
                  "learning_rate": 1.3212121212121213e-07,
         | 
| 195 | 
             
                  "logits/chosen": NaN,
         | 
| 196 | 
             
                  "logits/rejected": NaN,
         | 
| 197 | 
            +
                  "logps/chosen": -258.0960998535156,
         | 
| 198 | 
            +
                  "logps/rejected": -290.5296936035156,
         | 
| 199 | 
            +
                  "loss": 0.617,
         | 
| 200 | 
            +
                  "rewards/accuracies": 0.640625,
         | 
| 201 | 
            +
                  "rewards/chosen": -0.1353795975446701,
         | 
| 202 | 
            +
                  "rewards/margins": 0.7194949984550476,
         | 
| 203 | 
            +
                  "rewards/rejected": -0.8551372289657593,
         | 
| 204 | 
             
                  "step": 65
         | 
| 205 | 
             
                },
         | 
| 206 | 
             
                {
         | 
| 207 | 
            +
                  "epoch": 0.8034433285509326,
         | 
| 208 | 
            +
                  "grad_norm": 51.298985955922575,
         | 
| 209 | 
            +
                  "learning_rate": 1.2606060606060603e-07,
         | 
| 210 | 
             
                  "logits/chosen": NaN,
         | 
| 211 | 
             
                  "logits/rejected": NaN,
         | 
| 212 | 
            +
                  "logps/chosen": -290.3218688964844,
         | 
| 213 | 
            +
                  "logps/rejected": -378.1734313964844,
         | 
| 214 | 
            +
                  "loss": 0.5908,
         | 
| 215 | 
            +
                  "rewards/accuracies": 0.65625,
         | 
| 216 | 
            +
                  "rewards/chosen": -0.12958745658397675,
         | 
| 217 | 
            +
                  "rewards/margins": 0.8194991946220398,
         | 
| 218 | 
            +
                  "rewards/rejected": -0.9491798281669617,
         | 
| 219 | 
             
                  "step": 70
         | 
| 220 | 
             
                },
         | 
| 221 | 
             
                {
         | 
| 222 | 
            +
                  "epoch": 0.860832137733142,
         | 
| 223 | 
            +
                  "grad_norm": 51.37452979537066,
         | 
| 224 | 
            +
                  "learning_rate": 1.2e-07,
         | 
| 225 | 
             
                  "logits/chosen": NaN,
         | 
| 226 | 
             
                  "logits/rejected": NaN,
         | 
| 227 | 
            +
                  "logps/chosen": -272.2359313964844,
         | 
| 228 | 
            +
                  "logps/rejected": -461.9546813964844,
         | 
| 229 | 
            +
                  "loss": 0.6139,
         | 
| 230 | 
            +
                  "rewards/accuracies": 0.596875011920929,
         | 
| 231 | 
            +
                  "rewards/chosen": -0.08674906194210052,
         | 
| 232 | 
            +
                  "rewards/margins": 0.7904602289199829,
         | 
| 233 | 
            +
                  "rewards/rejected": -0.8770895004272461,
         | 
| 234 | 
             
                  "step": 75
         | 
| 235 | 
             
                },
         | 
| 236 | 
             
                {
         | 
| 237 | 
            +
                  "epoch": 0.9182209469153515,
         | 
| 238 | 
            +
                  "grad_norm": 49.12644933327405,
         | 
| 239 | 
            +
                  "learning_rate": 1.1393939393939393e-07,
         | 
| 240 | 
             
                  "logits/chosen": NaN,
         | 
| 241 | 
            +
                  "logits/rejected": NaN,
         | 
| 242 | 
            +
                  "logps/chosen": -271.92657470703125,
         | 
| 243 | 
            +
                  "logps/rejected": -385.4671936035156,
         | 
| 244 | 
            +
                  "loss": 0.5887,
         | 
| 245 | 
            +
                  "rewards/accuracies": 0.6343749761581421,
         | 
| 246 | 
            +
                  "rewards/chosen": -0.12120027840137482,
         | 
| 247 | 
            +
                  "rewards/margins": 0.9173402786254883,
         | 
| 248 | 
            +
                  "rewards/rejected": -1.0387518405914307,
         | 
| 249 | 
             
                  "step": 80
         | 
| 250 | 
             
                },
         | 
| 251 | 
             
                {
         | 
| 252 | 
            +
                  "epoch": 0.975609756097561,
         | 
| 253 | 
            +
                  "grad_norm": 78.67875740741145,
         | 
| 254 | 
            +
                  "learning_rate": 1.0787878787878789e-07,
         | 
| 255 | 
             
                  "logits/chosen": NaN,
         | 
| 256 | 
             
                  "logits/rejected": NaN,
         | 
| 257 | 
            +
                  "logps/chosen": -268.3734436035156,
         | 
| 258 | 
            +
                  "logps/rejected": -356.06561279296875,
         | 
| 259 | 
            +
                  "loss": 0.6094,
         | 
| 260 | 
            +
                  "rewards/accuracies": 0.621874988079071,
         | 
| 261 | 
            +
                  "rewards/chosen": -0.08560104668140411,
         | 
| 262 | 
            +
                  "rewards/margins": 0.7130492925643921,
         | 
| 263 | 
            +
                  "rewards/rejected": -0.7986106872558594,
         | 
| 264 | 
             
                  "step": 85
         | 
| 265 | 
             
                },
         | 
| 266 | 
             
                {
         | 
| 267 | 
            +
                  "epoch": 1.0329985652797704,
         | 
| 268 | 
            +
                  "grad_norm": 37.99397160982797,
         | 
| 269 | 
            +
                  "learning_rate": 1.018181818181818e-07,
         | 
| 270 | 
             
                  "logits/chosen": NaN,
         | 
| 271 | 
            +
                  "logits/rejected": -0.2726287841796875,
         | 
| 272 | 
            +
                  "logps/chosen": -278.57501220703125,
         | 
| 273 | 
            +
                  "logps/rejected": -375.9390563964844,
         | 
| 274 | 
            +
                  "loss": 0.5887,
         | 
| 275 | 
            +
                  "rewards/accuracies": 0.6499999761581421,
         | 
| 276 | 
            +
                  "rewards/chosen": -0.10773544013500214,
         | 
| 277 | 
            +
                  "rewards/margins": 0.7839363217353821,
         | 
| 278 | 
            +
                  "rewards/rejected": -0.892169177532196,
         | 
| 279 | 
             
                  "step": 90
         | 
| 280 | 
             
                },
         | 
| 281 | 
             
                {
         | 
| 282 | 
            +
                  "epoch": 1.0903873744619799,
         | 
| 283 | 
            +
                  "grad_norm": 38.13934723090496,
         | 
| 284 | 
            +
                  "learning_rate": 9.575757575757574e-08,
         | 
| 285 | 
             
                  "logits/chosen": NaN,
         | 
| 286 | 
             
                  "logits/rejected": NaN,
         | 
| 287 | 
            +
                  "logps/chosen": -287.20001220703125,
         | 
| 288 | 
            +
                  "logps/rejected": -351.8617248535156,
         | 
| 289 | 
            +
                  "loss": 0.5602,
         | 
| 290 | 
            +
                  "rewards/accuracies": 0.684374988079071,
         | 
| 291 | 
            +
                  "rewards/chosen": -0.09619579464197159,
         | 
| 292 | 
            +
                  "rewards/margins": 0.9326726794242859,
         | 
| 293 | 
            +
                  "rewards/rejected": -1.0286362171173096,
         | 
| 294 | 
             
                  "step": 95
         | 
| 295 | 
             
                },
         | 
| 296 | 
             
                {
         | 
| 297 | 
            +
                  "epoch": 1.1477761836441893,
         | 
| 298 | 
            +
                  "grad_norm": 59.66676077313003,
         | 
| 299 | 
            +
                  "learning_rate": 8.96969696969697e-08,
         | 
| 300 | 
             
                  "logits/chosen": NaN,
         | 
| 301 | 
             
                  "logits/rejected": NaN,
         | 
| 302 | 
            +
                  "logps/chosen": -253.2980499267578,
         | 
| 303 | 
            +
                  "logps/rejected": -323.6937561035156,
         | 
| 304 | 
            +
                  "loss": 0.5715,
         | 
| 305 | 
            +
                  "rewards/accuracies": 0.721875011920929,
         | 
| 306 | 
            +
                  "rewards/chosen": -0.09653882682323456,
         | 
| 307 | 
            +
                  "rewards/margins": 0.7721735239028931,
         | 
| 308 | 
            +
                  "rewards/rejected": -0.8687639236450195,
         | 
| 309 | 
             
                  "step": 100
         | 
| 310 | 
             
                },
         | 
| 311 | 
             
                {
         | 
| 312 | 
            +
                  "epoch": 1.2051649928263988,
         | 
| 313 | 
            +
                  "grad_norm": 40.181942906152656,
         | 
| 314 | 
            +
                  "learning_rate": 8.363636363636363e-08,
         | 
| 315 | 
             
                  "logits/chosen": NaN,
         | 
| 316 | 
             
                  "logits/rejected": NaN,
         | 
| 317 | 
            +
                  "logps/chosen": -269.91796875,
         | 
| 318 | 
            +
                  "logps/rejected": -342.6031188964844,
         | 
| 319 | 
            +
                  "loss": 0.5733,
         | 
| 320 | 
            +
                  "rewards/accuracies": 0.6781250238418579,
         | 
| 321 | 
            +
                  "rewards/chosen": -0.1024196594953537,
         | 
| 322 | 
            +
                  "rewards/margins": 0.6281814575195312,
         | 
| 323 | 
            +
                  "rewards/rejected": -0.7303474545478821,
         | 
| 324 | 
             
                  "step": 105
         | 
| 325 | 
             
                },
         | 
| 326 | 
             
                {
         | 
| 327 | 
            +
                  "epoch": 1.2625538020086085,
         | 
| 328 | 
            +
                  "grad_norm": 43.130194806696174,
         | 
| 329 | 
            +
                  "learning_rate": 7.757575757575757e-08,
         | 
| 330 | 
             
                  "logits/chosen": NaN,
         | 
| 331 | 
            +
                  "logits/rejected": -0.28910523653030396,
         | 
| 332 | 
            +
                  "logps/chosen": -256.4398498535156,
         | 
| 333 | 
            +
                  "logps/rejected": -315.8851623535156,
         | 
| 334 | 
            +
                  "loss": 0.5609,
         | 
| 335 | 
            +
                  "rewards/accuracies": 0.7406250238418579,
         | 
| 336 | 
            +
                  "rewards/chosen": -0.05760955810546875,
         | 
| 337 | 
            +
                  "rewards/margins": 0.8904060125350952,
         | 
| 338 | 
            +
                  "rewards/rejected": -0.948272705078125,
         | 
| 339 | 
             
                  "step": 110
         | 
| 340 | 
             
                },
         | 
| 341 | 
             
                {
         | 
| 342 | 
            +
                  "epoch": 1.3199426111908177,
         | 
| 343 | 
            +
                  "grad_norm": 40.187136674467965,
         | 
| 344 | 
            +
                  "learning_rate": 7.151515151515152e-08,
         | 
| 345 | 
             
                  "logits/chosen": NaN,
         | 
| 346 | 
             
                  "logits/rejected": NaN,
         | 
| 347 | 
            +
                  "logps/chosen": -286.8125,
         | 
| 348 | 
            +
                  "logps/rejected": -305.38751220703125,
         | 
| 349 | 
            +
                  "loss": 0.5859,
         | 
| 350 | 
            +
                  "rewards/accuracies": 0.684374988079071,
         | 
| 351 | 
            +
                  "rewards/chosen": -0.05495605617761612,
         | 
| 352 | 
            +
                  "rewards/margins": 0.7184921503067017,
         | 
| 353 | 
            +
                  "rewards/rejected": -0.7738761901855469,
         | 
| 354 | 
             
                  "step": 115
         | 
| 355 | 
             
                },
         | 
| 356 | 
             
                {
         | 
| 357 | 
            +
                  "epoch": 1.3773314203730274,
         | 
| 358 | 
            +
                  "grad_norm": 62.26850922502401,
         | 
| 359 | 
            +
                  "learning_rate": 6.545454545454545e-08,
         | 
| 360 | 
             
                  "logits/chosen": NaN,
         | 
| 361 | 
            +
                  "logits/rejected": NaN,
         | 
| 362 | 
            +
                  "logps/chosen": -281.8828125,
         | 
| 363 | 
            +
                  "logps/rejected": -348.609375,
         | 
| 364 | 
            +
                  "loss": 0.5641,
         | 
| 365 | 
            +
                  "rewards/accuracies": 0.715624988079071,
         | 
| 366 | 
            +
                  "rewards/chosen": -0.0843501091003418,
         | 
| 367 | 
            +
                  "rewards/margins": 0.8600600957870483,
         | 
| 368 | 
            +
                  "rewards/rejected": -0.9451843500137329,
         | 
| 369 | 
             
                  "step": 120
         | 
| 370 | 
             
                },
         | 
| 371 | 
             
                {
         | 
| 372 | 
            +
                  "epoch": 1.4347202295552366,
         | 
| 373 | 
            +
                  "grad_norm": 40.00896779584937,
         | 
| 374 | 
            +
                  "learning_rate": 5.93939393939394e-08,
         | 
| 375 | 
             
                  "logits/chosen": NaN,
         | 
| 376 | 
             
                  "logits/rejected": NaN,
         | 
| 377 | 
            +
                  "logps/chosen": -289.64959716796875,
         | 
| 378 | 
            +
                  "logps/rejected": -378.01873779296875,
         | 
| 379 | 
            +
                  "loss": 0.5644,
         | 
| 380 | 
            +
                  "rewards/accuracies": 0.7562500238418579,
         | 
| 381 | 
            +
                  "rewards/chosen": -0.06678199768066406,
         | 
| 382 | 
            +
                  "rewards/margins": 0.8301635980606079,
         | 
| 383 | 
            +
                  "rewards/rejected": -0.8972938656806946,
         | 
| 384 | 
             
                  "step": 125
         | 
| 385 | 
             
                },
         | 
| 386 | 
             
                {
         | 
| 387 | 
            +
                  "epoch": 1.4921090387374463,
         | 
| 388 | 
            +
                  "grad_norm": 33.42224905779865,
         | 
| 389 | 
            +
                  "learning_rate": 5.333333333333333e-08,
         | 
| 390 | 
             
                  "logits/chosen": NaN,
         | 
| 391 | 
             
                  "logits/rejected": NaN,
         | 
| 392 | 
            +
                  "logps/chosen": -228.01406860351562,
         | 
| 393 | 
            +
                  "logps/rejected": -385.5078125,
         | 
| 394 | 
            +
                  "loss": 0.5231,
         | 
| 395 | 
             
                  "rewards/accuracies": 0.7593749761581421,
         | 
| 396 | 
            +
                  "rewards/chosen": -0.06652259826660156,
         | 
| 397 | 
            +
                  "rewards/margins": 1.106951117515564,
         | 
| 398 | 
            +
                  "rewards/rejected": -1.173893690109253,
         | 
| 399 | 
             
                  "step": 130
         | 
| 400 | 
             
                },
         | 
| 401 | 
             
                {
         | 
| 402 | 
            +
                  "epoch": 1.5494978479196555,
         | 
| 403 | 
            +
                  "grad_norm": 35.377034170414966,
         | 
| 404 | 
            +
                  "learning_rate": 4.727272727272727e-08,
         | 
| 405 | 
             
                  "logits/chosen": NaN,
         | 
| 406 | 
             
                  "logits/rejected": NaN,
         | 
| 407 | 
            +
                  "logps/chosen": -276.4312438964844,
         | 
| 408 | 
            +
                  "logps/rejected": -374.94219970703125,
         | 
| 409 | 
            +
                  "loss": 0.5493,
         | 
| 410 | 
            +
                  "rewards/accuracies": 0.753125011920929,
         | 
| 411 | 
            +
                  "rewards/chosen": -0.07003593444824219,
         | 
| 412 | 
            +
                  "rewards/margins": 0.8382889032363892,
         | 
| 413 | 
            +
                  "rewards/rejected": -0.9086562991142273,
         | 
| 414 | 
             
                  "step": 135
         | 
| 415 | 
             
                },
         | 
| 416 | 
             
                {
         | 
| 417 | 
            +
                  "epoch": 1.6068866571018652,
         | 
| 418 | 
            +
                  "grad_norm": 42.98104883723141,
         | 
| 419 | 
            +
                  "learning_rate": 4.121212121212121e-08,
         | 
| 420 | 
             
                  "logits/chosen": NaN,
         | 
| 421 | 
             
                  "logits/rejected": NaN,
         | 
| 422 | 
            +
                  "logps/chosen": -245.8046875,
         | 
| 423 | 
            +
                  "logps/rejected": -332.34844970703125,
         | 
| 424 | 
            +
                  "loss": 0.5553,
         | 
| 425 | 
            +
                  "rewards/accuracies": 0.731249988079071,
         | 
| 426 | 
            +
                  "rewards/chosen": -0.09261999279260635,
         | 
| 427 | 
            +
                  "rewards/margins": 0.8664749264717102,
         | 
| 428 | 
            +
                  "rewards/rejected": -0.9593642950057983,
         | 
| 429 | 
             
                  "step": 140
         | 
| 430 | 
             
                },
         | 
| 431 | 
             
                {
         | 
| 432 | 
            +
                  "epoch": 1.6642754662840746,
         | 
| 433 | 
            +
                  "grad_norm": 41.194723091835506,
         | 
| 434 | 
            +
                  "learning_rate": 3.5151515151515146e-08,
         | 
| 435 | 
             
                  "logits/chosen": NaN,
         | 
| 436 | 
            +
                  "logits/rejected": NaN,
         | 
| 437 | 
            +
                  "logps/chosen": -271.9007873535156,
         | 
| 438 | 
            +
                  "logps/rejected": -372.2890625,
         | 
| 439 | 
            +
                  "loss": 0.576,
         | 
| 440 | 
            +
                  "rewards/accuracies": 0.703125,
         | 
| 441 | 
            +
                  "rewards/chosen": -0.10253047943115234,
         | 
| 442 | 
            +
                  "rewards/margins": 0.6510879397392273,
         | 
| 443 | 
            +
                  "rewards/rejected": -0.7538429498672485,
         | 
| 444 | 
             
                  "step": 145
         | 
| 445 | 
             
                },
         | 
| 446 | 
             
                {
         | 
| 447 | 
            +
                  "epoch": 1.721664275466284,
         | 
| 448 | 
            +
                  "grad_norm": 44.50624051602098,
         | 
| 449 | 
            +
                  "learning_rate": 2.9090909090909088e-08,
         | 
| 450 | 
             
                  "logits/chosen": NaN,
         | 
| 451 | 
            +
                  "logits/rejected": -0.3272903561592102,
         | 
| 452 | 
            +
                  "logps/chosen": -241.9031219482422,
         | 
| 453 | 
            +
                  "logps/rejected": -336.61407470703125,
         | 
| 454 | 
            +
                  "loss": 0.557,
         | 
| 455 | 
            +
                  "rewards/accuracies": 0.699999988079071,
         | 
| 456 | 
            +
                  "rewards/chosen": -0.09629325568675995,
         | 
| 457 | 
            +
                  "rewards/margins": 0.7580966949462891,
         | 
| 458 | 
            +
                  "rewards/rejected": -0.8545807003974915,
         | 
| 459 | 
             
                  "step": 150
         | 
| 460 | 
             
                },
         | 
| 461 | 
             
                {
         | 
| 462 | 
            +
                  "epoch": 1.7790530846484935,
         | 
| 463 | 
            +
                  "grad_norm": 35.414071145451,
         | 
| 464 | 
            +
                  "learning_rate": 2.303030303030303e-08,
         | 
| 465 | 
             
                  "logits/chosen": NaN,
         | 
| 466 | 
             
                  "logits/rejected": NaN,
         | 
| 467 | 
            +
                  "logps/chosen": -249.95938110351562,
         | 
| 468 | 
            +
                  "logps/rejected": -301.9765625,
         | 
| 469 | 
            +
                  "loss": 0.5675,
         | 
| 470 | 
            +
                  "rewards/accuracies": 0.778124988079071,
         | 
| 471 | 
            +
                  "rewards/chosen": -0.08613376319408417,
         | 
| 472 | 
            +
                  "rewards/margins": 0.8244568109512329,
         | 
| 473 | 
            +
                  "rewards/rejected": -0.9111496210098267,
         | 
| 474 | 
             
                  "step": 155
         | 
| 475 | 
             
                },
         | 
| 476 | 
             
                {
         | 
| 477 | 
            +
                  "epoch": 1.836441893830703,
         | 
| 478 | 
            +
                  "grad_norm": 38.330459518863954,
         | 
| 479 | 
            +
                  "learning_rate": 1.696969696969697e-08,
         | 
| 480 | 
             
                  "logits/chosen": NaN,
         | 
| 481 | 
            +
                  "logits/rejected": -0.393698126077652,
         | 
| 482 | 
            +
                  "logps/chosen": -262.72186279296875,
         | 
| 483 | 
            +
                  "logps/rejected": -394.54217529296875,
         | 
| 484 | 
            +
                  "loss": 0.5666,
         | 
| 485 | 
            +
                  "rewards/accuracies": 0.71875,
         | 
| 486 | 
            +
                  "rewards/chosen": -0.09622383117675781,
         | 
| 487 | 
            +
                  "rewards/margins": 0.8662067651748657,
         | 
| 488 | 
            +
                  "rewards/rejected": -0.9627344012260437,
         | 
| 489 | 
             
                  "step": 160
         | 
| 490 | 
             
                },
         | 
| 491 | 
             
                {
         | 
| 492 | 
            +
                  "epoch": 1.8938307030129125,
         | 
| 493 | 
            +
                  "grad_norm": 41.39148819542555,
         | 
| 494 | 
            +
                  "learning_rate": 1.0909090909090908e-08,
         | 
| 495 | 
             
                  "logits/chosen": NaN,
         | 
| 496 | 
            +
                  "logits/rejected": -0.3541931211948395,
         | 
| 497 | 
            +
                  "logps/chosen": -286.5625,
         | 
| 498 | 
            +
                  "logps/rejected": -391.0406188964844,
         | 
| 499 | 
            +
                  "loss": 0.5354,
         | 
| 500 | 
            +
                  "rewards/accuracies": 0.75,
         | 
| 501 | 
            +
                  "rewards/chosen": -0.07141885906457901,
         | 
| 502 | 
            +
                  "rewards/margins": 1.0486961603164673,
         | 
| 503 | 
            +
                  "rewards/rejected": -1.120294213294983,
         | 
| 504 | 
             
                  "step": 165
         | 
| 505 | 
             
                },
         | 
| 506 | 
             
                {
         | 
| 507 | 
            +
                  "epoch": 1.951219512195122,
         | 
| 508 | 
            +
                  "grad_norm": 39.2547263324904,
         | 
| 509 | 
            +
                  "learning_rate": 4.848484848484848e-09,
         | 
| 510 | 
             
                  "logits/chosen": NaN,
         | 
| 511 | 
            +
                  "logits/rejected": -0.39358216524124146,
         | 
| 512 | 
            +
                  "logps/chosen": -267.47265625,
         | 
| 513 | 
            +
                  "logps/rejected": -372.5625,
         | 
| 514 | 
            +
                  "loss": 0.5478,
         | 
| 515 | 
            +
                  "rewards/accuracies": 0.706250011920929,
         | 
| 516 | 
            +
                  "rewards/chosen": -0.05818195268511772,
         | 
| 517 | 
            +
                  "rewards/margins": 1.0078842639923096,
         | 
| 518 | 
            +
                  "rewards/rejected": -1.065637230873108,
         | 
| 519 | 
             
                  "step": 170
         | 
| 520 | 
             
                },
         | 
| 521 | 
             
                {
         | 
| 522 | 
            +
                  "epoch": 1.9971305595408895,
         | 
| 523 | 
            +
                  "step": 174,
         | 
| 524 | 
             
                  "total_flos": 0.0,
         | 
| 525 | 
            +
                  "train_loss": 0.5929731760901966,
         | 
| 526 | 
            +
                  "train_runtime": 2030.2211,
         | 
| 527 | 
            +
                  "train_samples_per_second": 5.493,
         | 
| 528 | 
            +
                  "train_steps_per_second": 0.086
         | 
| 529 | 
             
                }
         | 
| 530 | 
             
              ],
         | 
| 531 | 
             
              "logging_steps": 5,
         | 
| 532 | 
            +
              "max_steps": 174,
         | 
| 533 | 
             
              "num_input_tokens_seen": 0,
         | 
| 534 | 
             
              "num_train_epochs": 2,
         | 
| 535 | 
             
              "save_steps": 50,
         | 
    	
        training_args.bin
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
             
            size 7672
         | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2c63478df6f2cc0a8edc66dba8ca06784108c3576df8e676705056d7247719f8
         | 
| 3 | 
             
            size 7672
         | 
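
A minimal sketch (not part of the commit) for inspecting the training curves recorded in the trainer_state.json diff above. It assumes trainer_state.json sits in the working directory, follows the standard transformers Trainer layout with per-step dicts under "log_history", and that matplotlib is available; the filename and plotting choices are illustrative only.

```python
# Sketch only: parse trainer_state.json and plot the logged DPO loss and reward margins.
# Assumes the standard transformers Trainer schema ("log_history" list of dicts).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)  # Python's json module tolerates the NaN values logged for logits

# Keep only per-step logging entries; the final summary entry carries no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
loss = [entry["loss"] for entry in logs]
margins = [entry["rewards/margins"] for entry in logs]

fig, (ax_loss, ax_margin) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, loss)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("DPO loss")
ax_margin.plot(steps, margins)
ax_margin.set_xlabel("step")
ax_margin.set_ylabel("rewards/margins")
fig.tight_layout()
plt.show()
```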
