Zaynes committed · verified
Commit 2203975 · 1 Parent(s): 446c818

Upload folder using huggingface_hub

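The commit message indicates the folder was pushed with `huggingface_hub`. A minimal sketch of that kind of upload is shown below; the local folder path is an assumption, and the repo id is the one that appears in the training logs later in this commit.

```python
# Hedged sketch of a folder upload with huggingface_hub.
# folder_path is an assumed local export directory; repo_id comes from the training logs below.
from huggingface_hub import HfApi

api = HfApi()  # reads the token from HF_TOKEN or the cached `huggingface-cli login`
api.upload_folder(
    folder_path="experiments/testing__pvv2_lora/merged",
    repo_id="TAUR-dev/testing__pvv2_lora",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```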
.gitattributes CHANGED
@@ -1,35 +1,8 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Mark all log files as text to prevent binary file issues
2
+ *.log text
3
+ *.txt text
4
+ *.out text
5
+ *.err text
6
+ training_artifacts/logs/* text
7
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
8
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
Modelfile ADDED
@@ -0,0 +1,16 @@
1
+ # ollama modelfile auto-generated by llamafactory
2
+
3
+ FROM .
4
+
5
+ TEMPLATE """{{ if .System }}<|im_start|>system
6
+ {{ .System }}<|im_end|>
7
+ {{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
8
+ {{ .Content }}<|im_end|>
9
+ <|im_start|>assistant
10
+ {{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
11
+ {{ end }}{{ end }}"""
12
+
13
+ SYSTEM """You are Qwen, created by Alibaba Cloud. You are a helpful assistant."""
14
+
15
+ PARAMETER stop "<|im_end|>"
16
+ PARAMETER num_ctx 4096
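The Modelfile above is the Ollama wrapper LlamaFactory auto-generates for the exported weights: it points `FROM .` at the local directory, reuses the Qwen ChatML template, and caps the context at 4096 tokens. A hedged sketch of registering and querying it through the Ollama CLI follows; the model name `testing-pvv2-lora` is an arbitrary choice, not part of this commit.

```python
# Hedged sketch: build an Ollama model from this Modelfile and send it one prompt.
# "ollama create -f" and "ollama run" are standard CLI commands; the model name is made up.
import subprocess

subprocess.run(["ollama", "create", "testing-pvv2-lora", "-f", "Modelfile"], check=True)
result = subprocess.run(
    ["ollama", "run", "testing-pvv2-lora", "Briefly introduce yourself."],
    capture_output=True, text=True, check=True,
)
print(result.stdout)
```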
added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
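added_tokens.json pins the IDs of the Qwen2 special tokens (ChatML markers, tool-call tags, vision placeholders, FIM tokens). A quick sanity check that the published tokenizer resolves them to the same IDs, assuming this repository id:

```python
# Sketch: confirm a few special-token IDs match added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TAUR-dev/testing__pvv2_lora")
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
```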
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
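The template above is the stock Qwen2.5 chat template: it injects a default system prompt, wraps every turn in `<|im_start|>`/`<|im_end|>`, and serializes tool calls as `<tool_call>` JSON blocks. Rather than formatting prompts by hand, the usual route is `apply_chat_template`, sketched here with an assumed repository id:

```python
# Sketch: render a conversation through the shipped chat template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TAUR-dev/testing__pvv2_lora")
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": "Summarize LoRA in one sentence."},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # ends with "<|im_start|>assistant\n" because add_generation_prompt=True
```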
config.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
+ "layer_types": [
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention"
42
+ ],
43
+ "max_position_embeddings": 32768,
44
+ "max_window_layers": 21,
45
+ "model_type": "qwen2",
46
+ "num_attention_heads": 12,
47
+ "num_hidden_layers": 28,
48
+ "num_key_value_heads": 2,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.1",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151936
58
+ }
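config.json describes an unmodified Qwen2.5-1.5B-Instruct architecture: 28 full-attention layers, hidden size 1536 with 12 query heads and 2 KV heads (grouped-query attention, head dim 128), tied input/output embeddings, and a 32768-token position limit. A sketch of loading it, assuming this repository id:

```python
# Sketch: load the config/model and read off a few derived shapes.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo = "TAUR-dev/testing__pvv2_lora"
cfg = AutoConfig.from_pretrained(repo)
print(cfg.hidden_size // cfg.num_attention_heads)  # head dim: 1536 / 12 = 128

model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.bfloat16)
print(sum(p.numel() for p in model.parameters()))  # ~1.54B parameters (embeddings tied)
```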
generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.57.1"
14
+ }
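generation_config.json carries the sampling defaults (temperature 0.7, top-p 0.8, top-k 20, repetition penalty 1.1, sampling enabled). `generate` picks these up automatically, but they can also be loaded and overridden explicitly, as in this sketch:

```python
# Sketch: use the shipped sampling defaults, overriding only the output length.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

repo = "TAUR-dev/testing__pvv2_lora"
tok = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo)

gen_cfg = GenerationConfig.from_pretrained(repo)  # temperature=0.7, top_p=0.8, top_k=20, ...
inputs = tok("Hello, who are you?", return_tensors="pt")
out = model.generate(**inputs, generation_config=gen_cfg, max_new_tokens=64)
print(tok.decode(out[0], skip_special_tokens=True))
```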
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667e91af8fd2f29b75aae0c0d91510f0fb65cd6ca6ae30755afad87f358c287
3
+ size 3087467144
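This entry is a Git LFS pointer, not the weights themselves: the ~3.1 GB `model.safetensors` blob is stored in LFS and identified by its SHA-256. A small sketch for verifying a downloaded copy against the pointer's oid:

```python
# Sketch: confirm a downloaded model.safetensors matches the LFS pointer's sha256.
import hashlib

EXPECTED = "4667e91af8fd2f29b75aae0c0d91510f0fb65cd6ca6ae30755afad87f358c287"

h = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == EXPECTED, "mismatch: corrupt download or a bare LFS pointer file"
```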
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "left",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
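tokenizer_config.json sets left-side padding, `<|endoftext|>` as the pad token, and a 131072-token `model_max_length`, which is the usual setup for batched decoder-only generation. A short sketch of how those settings surface when tokenizing a batch:

```python
# Sketch: batched tokenization picks up padding_side="left" and the pad token automatically.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TAUR-dev/testing__pvv2_lora")
print(tok.padding_side, tok.pad_token, tok.model_max_length)  # left <|endoftext|> 131072

batch = tok(["short prompt", "a somewhat longer prompt"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # shorter sequences are padded on the left
```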
training_artifacts/README.md ADDED
@@ -0,0 +1,16 @@
1
+ # Training Artifacts
2
+
3
+ This directory contains the training configuration and logs for this model.
4
+
5
+ ## Contents
6
+
7
+ - **hydra_config.yaml**: Complete Hydra configuration used for training
8
+ - **train_config.yaml**: LlamaFactory training configuration
9
+ - **merge_config.yaml**: LlamaFactory merge/export configuration
10
+ - **logs/**: Training logs from the job (cleaned for text format)
11
+
12
+ ## Job Information
13
+
14
+ - Job Name: testing__pvv2_lora
15
+ - Timestamp: 2025-10-25 03:56:22 UTC
16
+ - Execution Mode: Local
training_artifacts/hydra_config.yaml ADDED
@@ -0,0 +1,134 @@
1
+ _target_: null
2
+ job:
3
+ name: testing__pvv2_lora
4
+ mode: local
5
+ dry_run: false
6
+ work_dir: null
7
+ slurm:
8
+ time_limit: null
9
+ constraint: null
10
+ memory: null
11
+ cpus_per_task: 16
12
+ partition: null
13
+ mail_user: null
14
+ execution:
15
+ nodes: 1
16
+ gpus_per_node: 2
17
+ num_gpus: null
18
+ hostfile: null
19
+ secrets_file: ./secrets.env
20
+ model:
21
+ name_or_path: Qwen/Qwen2.5-1.5B-Instruct
22
+ finetuning_type: lora
23
+ dataset:
24
+ name: TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
25
+ dir: null
26
+ info_json: null
27
+ template: qwen
28
+ cutoff_len: 16192
29
+ val_size: 0.0
30
+ tokenized_path: /scratch/zrs2020/.cache/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
31
+ hf_hub_url: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
32
+ formatting: sharegpt
33
+ ranking: false
34
+ subset: null
35
+ split: train
36
+ folder: null
37
+ num_samples: null
38
+ columns:
39
+ prompt: null
40
+ query: null
41
+ response: null
42
+ history: null
43
+ messages: conversations
44
+ system: null
45
+ tools: null
46
+ images: null
47
+ videos: null
48
+ audios: null
49
+ chosen: null
50
+ rejected: null
51
+ kto_tag: null
52
+ tags:
53
+ role: role
54
+ content: content
55
+ user: user
56
+ assistant: assistant
57
+ observation: null
58
+ function: null
59
+ system: null
60
+ output:
61
+ experiment_dir: ./experiments
62
+ wandb:
63
+ project: null
64
+ run_name: testing__pvv2_lora
65
+ entity: null
66
+ hf:
67
+ repo_id: TAUR-dev/testing__pvv2_lora
68
+ private: false
69
+ upload_artifacts: true
70
+ cleanup:
71
+ checkpoints: false
72
+ merged: false
73
+ training:
74
+ stage: sft
75
+ do_train: true
76
+ max_samples: 100000
77
+ do_eval: false
78
+ save_strategy: steps
79
+ save_steps: 5
80
+ logging_steps: 10
81
+ fp16: false
82
+ bf16: true
83
+ adam_beta1: 0.9
84
+ adam_beta2: 0.95
85
+ overwrite_output_dir: true
86
+ per_device_train_batch_size: 1
87
+ gradient_accumulation_steps: 1
88
+ gradient_checkpointing: true
89
+ learning_rate: 1.0e-06
90
+ lr_scheduler_type: cosine
91
+ num_train_epochs: 2
92
+ warmup_ratio: 0.05
93
+ weight_decay: 0.0001
94
+ template: qwen
95
+ max_steps: 10
96
+ preprocessing_num_workers: 16
97
+ overwrite_cache: true
98
+ finetuning:
99
+ training:
100
+ stage: sft
101
+ do_train: true
102
+ finetuning_type: lora
103
+ lora_rank: 8
104
+ lora_alpha: 16
105
+ lora_dropout: 0.05
106
+ lora_target: all
107
+ overwrite_cache: true
108
+ preprocessing_num_workers: 16
109
+ dataloader_num_workers: 4
110
+ logging_steps: 10
111
+ save_steps: 500
112
+ plot_loss: true
113
+ overwrite_output_dir: true
114
+ save_only_model: false
115
+ report_to: none
116
+ per_device_train_batch_size: 1
117
+ gradient_accumulation_steps: 8
118
+ learning_rate: 0.0001
119
+ num_train_epochs: 3.0
120
+ lr_scheduler_type: cosine
121
+ warmup_ratio: 0.1
122
+ bf16: true
123
+ ddp_timeout: 180000000
124
+ resume_from_checkpoint: null
125
+ val_size: 0.1
126
+ per_device_eval_batch_size: 1
127
+ eval_strategy: steps
128
+ eval_steps: 500
129
+ do_eval: true
130
+ merge:
131
+ export_dir: null
132
+ export_size: 5
133
+ export_device: cpu
134
+ export_legacy_format: false
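The `finetuning` block requests a rank-8 LoRA (alpha 16, dropout 0.05) on all linear projections, which matches the modules LlamaFactory reports in the log below (q/k/v/o plus the MLP projections). For reference, an equivalent adapter written directly with `peft` would look roughly like the following; this is an illustrative sketch, not the configuration LlamaFactory generates internally.

```python
# Hedged sketch: a peft adapter equivalent to lora_rank=8, lora_alpha=16,
# lora_dropout=0.05, lora_target=all on Qwen2.5-1.5B-Instruct.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # ~9.2M trainable of ~1.55B total, as in the training log
```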
training_artifacts/logs/pipeline_cleaned.txt ADDED
@@ -0,0 +1,708 @@
1
+ [2025-10-24 23:55:28] ========================================
2
+ [2025-10-24 23:55:28] Job Name: testing__pvv2_lora
3
+ [2025-10-24 23:55:28] Hostname: gl007.hpc.nyu.edu
4
+ [2025-10-24 23:55:28] Number of nodes: 1
5
+ [2025-10-24 23:55:28] GPUs per node: 2
6
+ [2025-10-24 23:55:28] Start Time: Fri Oct 24 11:55:28 PM EDT 2025
7
+ [2025-10-24 23:55:28] Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/logs/pipeline.log
8
+ [2025-10-24 23:55:28] ========================================
9
+ [2025-10-24 23:55:28] Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
10
+ [2025-10-24 23:55:30]
11
+ [2025-10-24 23:55:30] ========================================
12
+ [2025-10-24 23:55:30] Configuration Paths
13
+ [2025-10-24 23:55:30] ========================================
14
+ [2025-10-24 23:55:30] Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/configs/train_config.yaml
15
+ [2025-10-24 23:55:30] Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/configs/merge_config.yaml
16
+ [2025-10-24 23:55:30] Dataset Info: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data/dataset_info.json
17
+ [2025-10-24 23:55:30] Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints
18
+ [2025-10-24 23:55:30] Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged
19
+ [2025-10-24 23:55:30] HF Repo ID: TAUR-dev/testing__pvv2_lora
20
+ [2025-10-24 23:55:30]
21
+ [make-effective-cfg] tokenized_path: /scratch/zrs2020/.cache/hf_cache/home/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full_fb94f2a3
22
+ [make-effective-cfg] wrote: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/logs/train_config.effective.yaml
23
+ [2025-10-24 23:55:30]
24
+ [2025-10-24 23:55:30] ========================================
25
+ [2025-10-24 23:55:30] STAGE 0: Downloading Dataset
26
+ [2025-10-24 23:55:30] Dataset: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
27
+ [2025-10-24 23:55:30] Start Time: Fri Oct 24 11:55:30 PM EDT 2025
28
+ [2025-10-24 23:55:30] ========================================
29
+ [dataset-download] Loading dataset from: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
30
+ [dataset-download] Dataset loaded successfully
31
+ [dataset-download] Dataset info: DatasetDict({
32
+ train: Dataset({
33
+ features: ['conversations', 'sft_template_type_idx'],
34
+ num_rows: 29130
35
+ })
36
+ })
37
+ [2025-10-24 23:55:32]
38
+ [2025-10-24 23:55:32] ========================================
39
+ [2025-10-24 23:55:32] Dataset download completed
40
+ [2025-10-24 23:55:32] End Time: Fri Oct 24 11:55:32 PM EDT 2025
41
+ [2025-10-24 23:55:32] ========================================
42
+ [2025-10-24 23:55:32]
43
+ [2025-10-24 23:55:32] ========================================
44
+ [2025-10-24 23:55:32] STAGE 1: Training Model
45
+ [2025-10-24 23:55:32] Start Time: Fri Oct 24 11:55:32 PM EDT 2025
46
+ [2025-10-24 23:55:32] ========================================
47
+ [2025-10-24 23:55:32] Job: testing__pvv2_lora
48
+ [2025-10-24 23:55:32] Nodes: 1 | GPUs/node: 2
49
+ [2025-10-24 23:55:32] Master: 127.0.0.1:29500
50
+ [2025-10-24 23:55:32] LLaMA-Factory: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
51
+ [2025-10-24 23:55:32] Train cfg (effective): /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/logs/train_config.effective.yaml
52
+ [2025-10-24 23:55:32] HF cache: /scratch/zrs2020/.cache/hf_cache/home/datasets
53
+ [2025-10-24 23:55:32] Launcher: torchrun
54
+ [2025-10-24 23:55:32]
55
+ [2025-10-24 23:55:32] Single-node training (2 GPU(s))
56
+ [2025-10-24 23:55:32] Executing command: llamafactory-cli train /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/logs/train_config.effective.yaml
57
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
58
+ warnings.warn(
59
+ [INFO|2025-10-24 23:55:40] llamafactory.launcher:143 >> Initializing 2 distributed tasks at: 127.0.0.1:29500
60
+ W1024 23:55:41.864000 3022854 site-packages/torch/distributed/run.py:803]
61
+ W1024 23:55:41.864000 3022854 site-packages/torch/distributed/run.py:803] *****************************************
62
+ W1024 23:55:41.864000 3022854 site-packages/torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
63
+ W1024 23:55:41.864000 3022854 site-packages/torch/distributed/run.py:803] *****************************************
64
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
65
+ warnings.warn(
66
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
67
+ warnings.warn(
68
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
69
+ import pkg_resources
70
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
71
+ import pkg_resources
72
+ [W1024 23:55:50.757874363 ProcessGroupNCCL.cpp:924] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
73
+ [W1024 23:55:50.757887679 ProcessGroupNCCL.cpp:924] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
74
+ [INFO|2025-10-24 23:55:50] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
75
+ [INFO|2025-10-24 23:55:50] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 2, device: cuda:0, distributed training: True, compute dtype: torch.bfloat16
76
+ [INFO|2025-10-24 23:55:50] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 2, device: cuda:1, distributed training: True, compute dtype: torch.bfloat16
77
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,441 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
78
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,442 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
79
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,442 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
80
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,442 >> loading file added_tokens.json from cache at None
81
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,442 >> loading file special_tokens_map.json from cache at None
82
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,442 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
83
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,442 >> loading file chat_template.jinja from cache at None
84
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:55:50,609 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
85
+ [INFO|configuration_utils.py:765] 2025-10-24 23:55:50,826 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
86
+ [INFO|configuration_utils.py:839] 2025-10-24 23:55:50,828 >> Model config Qwen2Config {
87
+ "architectures": [
88
+ "Qwen2ForCausalLM"
89
+ ],
90
+ "attention_dropout": 0.0,
91
+ "bos_token_id": 151643,
92
+ "dtype": "bfloat16",
93
+ "eos_token_id": 151645,
94
+ "hidden_act": "silu",
95
+ "hidden_size": 1536,
96
+ "initializer_range": 0.02,
97
+ "intermediate_size": 8960,
98
+ "layer_types": [
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention",
111
+ "full_attention",
112
+ "full_attention",
113
+ "full_attention",
114
+ "full_attention",
115
+ "full_attention",
116
+ "full_attention",
117
+ "full_attention",
118
+ "full_attention",
119
+ "full_attention",
120
+ "full_attention",
121
+ "full_attention",
122
+ "full_attention",
123
+ "full_attention",
124
+ "full_attention",
125
+ "full_attention",
126
+ "full_attention"
127
+ ],
128
+ "max_position_embeddings": 32768,
129
+ "max_window_layers": 21,
130
+ "model_type": "qwen2",
131
+ "num_attention_heads": 12,
132
+ "num_hidden_layers": 28,
133
+ "num_key_value_heads": 2,
134
+ "rms_norm_eps": 1e-06,
135
+ "rope_scaling": null,
136
+ "rope_theta": 1000000.0,
137
+ "sliding_window": null,
138
+ "tie_word_embeddings": true,
139
+ "transformers_version": "4.57.1",
140
+ "use_cache": true,
141
+ "use_sliding_window": false,
142
+ "vocab_size": 151936
143
+ }
144
+
145
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
146
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
147
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
148
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file added_tokens.json from cache at None
149
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file special_tokens_map.json from cache at None
150
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
151
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:55:50,899 >> loading file chat_template.jinja from cache at None
152
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:55:51,063 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
153
+ [WARNING|2025-10-24 23:55:51] llamafactory.data.loader:148 >> Loading dataset from disk will ignore other data arguments.
154
+ [INFO|2025-10-24 23:55:51] llamafactory.data.loader:143 >> Loaded tokenized dataset from /scratch/zrs2020/.cache/hf_cache/home/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full_fb94f2a3.
155
+ [INFO|configuration_utils.py:765] 2025-10-24 23:55:51,138 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
156
+ [INFO|configuration_utils.py:839] 2025-10-24 23:55:51,138 >> Model config Qwen2Config {
157
+ "architectures": [
158
+ "Qwen2ForCausalLM"
159
+ ],
160
+ "attention_dropout": 0.0,
161
+ "bos_token_id": 151643,
162
+ "dtype": "bfloat16",
163
+ "eos_token_id": 151645,
164
+ "hidden_act": "silu",
165
+ "hidden_size": 1536,
166
+ "initializer_range": 0.02,
167
+ "intermediate_size": 8960,
168
+ "layer_types": [
169
+ "full_attention",
170
+ "full_attention",
171
+ "full_attention",
172
+ "full_attention",
173
+ "full_attention",
174
+ "full_attention",
175
+ "full_attention",
176
+ "full_attention",
177
+ "full_attention",
178
+ "full_attention",
179
+ "full_attention",
180
+ "full_attention",
181
+ "full_attention",
182
+ "full_attention",
183
+ "full_attention",
184
+ "full_attention",
185
+ "full_attention",
186
+ "full_attention",
187
+ "full_attention",
188
+ "full_attention",
189
+ "full_attention",
190
+ "full_attention",
191
+ "full_attention",
192
+ "full_attention",
193
+ "full_attention",
194
+ "full_attention",
195
+ "full_attention",
196
+ "full_attention"
197
+ ],
198
+ "max_position_embeddings": 32768,
199
+ "max_window_layers": 21,
200
+ "model_type": "qwen2",
201
+ "num_attention_heads": 12,
202
+ "num_hidden_layers": 28,
203
+ "num_key_value_heads": 2,
204
+ "rms_norm_eps": 1e-06,
205
+ "rope_scaling": null,
206
+ "rope_theta": 1000000.0,
207
+ "sliding_window": null,
208
+ "tie_word_embeddings": true,
209
+ "transformers_version": "4.57.1",
210
+ "use_cache": true,
211
+ "use_sliding_window": false,
212
+ "vocab_size": 151936
213
+ }
214
+
215
+ [INFO|2025-10-24 23:55:51] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
216
+ [WARNING|logging.py:328] 2025-10-24 23:55:51,492 >> `torch_dtype` is deprecated! Use `dtype` instead!
217
+ [INFO|modeling_utils.py:1172] 2025-10-24 23:55:51,493 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/model.safetensors
218
+ [INFO|modeling_utils.py:2341] 2025-10-24 23:55:51,494 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
219
+ [INFO|configuration_utils.py:986] 2025-10-24 23:55:51,495 >> Generate config GenerationConfig {
220
+ "bos_token_id": 151643,
221
+ "eos_token_id": 151645,
222
+ "use_cache": false
223
+ }
224
+
225
+ `torch_dtype` is deprecated! Use `dtype` instead!
226
+ [INFO|configuration_utils.py:941] 2025-10-24 23:55:52,421 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/generation_config.json
227
+ [INFO|configuration_utils.py:986] 2025-10-24 23:55:52,421 >> Generate config GenerationConfig {
228
+ "bos_token_id": 151643,
229
+ "do_sample": true,
230
+ "eos_token_id": [
231
+ 151645,
232
+ 151643
233
+ ],
234
+ "pad_token_id": 151643,
235
+ "repetition_penalty": 1.1,
236
+ "temperature": 0.7,
237
+ "top_k": 20,
238
+ "top_p": 0.8
239
+ }
240
+
241
+ [INFO|dynamic_module_utils.py:423] 2025-10-24 23:55:52,453 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-1.5B-Instruct.
242
+ [INFO|2025-10-24 23:55:52] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
243
+ [INFO|2025-10-24 23:55:52] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
244
+ [INFO|2025-10-24 23:55:52] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
245
+ [INFO|2025-10-24 23:55:52] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
246
+ [INFO|2025-10-24 23:55:52] llamafactory.model.model_utils.misc:143 >> Found linear modules: o_proj,gate_proj,q_proj,down_proj,v_proj,k_proj,up_proj
247
+ [INFO|2025-10-24 23:55:52] llamafactory.model.loader:143 >> trainable params: 9,232,384 || all params: 1,552,946,688 || trainable%: 0.5945
248
+ [WARNING|trainer.py:906] 2025-10-24 23:55:52,738 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
249
+ [INFO|trainer.py:699] 2025-10-24 23:55:52,740 >> max_steps is given, it will override any value given in num_train_epochs
250
+ [INFO|trainer.py:749] 2025-10-24 23:55:52,740 >> Using auto half precision backend
251
+ [WARNING|trainer.py:982] 2025-10-24 23:55:52,742 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
252
+ The model is already on multiple devices. Skipping the move to device specified in `args`.
253
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
254
+ NCCL version 2.27.5+cuda12.9
255
+ [INFO|trainer.py:2519] 2025-10-24 23:55:53,120 >> ***** Running training *****
256
+ [INFO|trainer.py:2520] 2025-10-24 23:55:53,120 >> Num examples = 29,130
257
+ [INFO|trainer.py:2521] 2025-10-24 23:55:53,120 >> Num Epochs = 1
258
+ [INFO|trainer.py:2522] 2025-10-24 23:55:53,120 >> Instantaneous batch size per device = 1
259
+ [INFO|trainer.py:2525] 2025-10-24 23:55:53,120 >> Total train batch size (w. parallel, distributed & accumulation) = 2
260
+ [INFO|trainer.py:2526] 2025-10-24 23:55:53,120 >> Gradient Accumulation steps = 1
261
+ [INFO|trainer.py:2527] 2025-10-24 23:55:53,120 >> Total optimization steps = 10
262
+ [INFO|trainer.py:2528] 2025-10-24 23:55:53,122 >> Number of trainable parameters = 9,232,384
263
+ [INFO|integration_utils.py:867] 2025-10-24 23:55:53,220 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
264
+ wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
265
+ wandb: Tracking run with wandb version 0.22.2
266
+ wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251024_235553-oqx8ngeo
267
+ wandb: Run `wandb offline` to turn off syncing.
268
+ wandb: Syncing run testing__pvv2_lora
269
+ wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
270
+ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/oqx8ngeo
271
+ 0%| | 0/10 [00:00<?, ?it/s] 10%| | 1/10 [00:01<00:10, 1.22s/it] 20%| | 2/10 [00:01<00:06, 1.27it/s] 30%| | 3/10 [00:02<00:04, 1.68it/s] 40%| | 4/10 [00:03<00:04, 1.37it/s] 50%| | 5/10 [00:03<00:03, 1.49it/s][INFO|trainer.py:4309] 2025-10-24 23:55:57,737 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-5
272
+ [INFO|configuration_utils.py:765] 2025-10-24 23:55:57,839 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
273
+ [INFO|configuration_utils.py:839] 2025-10-24 23:55:57,840 >> Model config Qwen2Config {
274
+ "architectures": [
275
+ "Qwen2ForCausalLM"
276
+ ],
277
+ "attention_dropout": 0.0,
278
+ "bos_token_id": 151643,
279
+ "dtype": "bfloat16",
280
+ "eos_token_id": 151645,
281
+ "hidden_act": "silu",
282
+ "hidden_size": 1536,
283
+ "initializer_range": 0.02,
284
+ "intermediate_size": 8960,
285
+ "layer_types": [
286
+ "full_attention",
287
+ "full_attention",
288
+ "full_attention",
289
+ "full_attention",
290
+ "full_attention",
291
+ "full_attention",
292
+ "full_attention",
293
+ "full_attention",
294
+ "full_attention",
295
+ "full_attention",
296
+ "full_attention",
297
+ "full_attention",
298
+ "full_attention",
299
+ "full_attention",
300
+ "full_attention",
301
+ "full_attention",
302
+ "full_attention",
303
+ "full_attention",
304
+ "full_attention",
305
+ "full_attention",
306
+ "full_attention",
307
+ "full_attention",
308
+ "full_attention",
309
+ "full_attention",
310
+ "full_attention",
311
+ "full_attention",
312
+ "full_attention",
313
+ "full_attention"
314
+ ],
315
+ "max_position_embeddings": 32768,
316
+ "max_window_layers": 21,
317
+ "model_type": "qwen2",
318
+ "num_attention_heads": 12,
319
+ "num_hidden_layers": 28,
320
+ "num_key_value_heads": 2,
321
+ "rms_norm_eps": 1e-06,
322
+ "rope_scaling": null,
323
+ "rope_theta": 1000000.0,
324
+ "sliding_window": null,
325
+ "tie_word_embeddings": true,
326
+ "transformers_version": "4.57.1",
327
+ "use_cache": true,
328
+ "use_sliding_window": false,
329
+ "vocab_size": 151936
330
+ }
331
+
332
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:55:58,067 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-5/chat_template.jinja
333
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:55:58,072 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-5/tokenizer_config.json
334
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:55:58,076 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-5/special_tokens_map.json
335
+ 60%| | 6/10 [00:05<00:04, 1.18s/it] 70%| | 7/10 [00:06<00:02, 1.11it/s] 80%| | 8/10 [00:06<00:01, 1.23it/s] 90%| | 9/10 [00:07<00:00, 1.45it/s]100%|| 10/10 [00:08<00:00, 1.23it/s] {'loss': 0.7188, 'grad_norm': 0.2177160233259201, 'learning_rate': 3.015368960704584e-08, 'epoch': 0.0}
336
+ 100%|| 10/10 [00:08<00:00, 1.23it/s][INFO|trainer.py:4309] 2025-10-24 23:56:02,371 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
337
+ [INFO|configuration_utils.py:765] 2025-10-24 23:56:02,490 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
338
+ [INFO|configuration_utils.py:839] 2025-10-24 23:56:02,491 >> Model config Qwen2Config {
339
+ "architectures": [
340
+ "Qwen2ForCausalLM"
341
+ ],
342
+ "attention_dropout": 0.0,
343
+ "bos_token_id": 151643,
344
+ "dtype": "bfloat16",
345
+ "eos_token_id": 151645,
346
+ "hidden_act": "silu",
347
+ "hidden_size": 1536,
348
+ "initializer_range": 0.02,
349
+ "intermediate_size": 8960,
350
+ "layer_types": [
351
+ "full_attention",
352
+ "full_attention",
353
+ "full_attention",
354
+ "full_attention",
355
+ "full_attention",
356
+ "full_attention",
357
+ "full_attention",
358
+ "full_attention",
359
+ "full_attention",
360
+ "full_attention",
361
+ "full_attention",
362
+ "full_attention",
363
+ "full_attention",
364
+ "full_attention",
365
+ "full_attention",
366
+ "full_attention",
367
+ "full_attention",
368
+ "full_attention",
369
+ "full_attention",
370
+ "full_attention",
371
+ "full_attention",
372
+ "full_attention",
373
+ "full_attention",
374
+ "full_attention",
375
+ "full_attention",
376
+ "full_attention",
377
+ "full_attention",
378
+ "full_attention"
379
+ ],
380
+ "max_position_embeddings": 32768,
381
+ "max_window_layers": 21,
382
+ "model_type": "qwen2",
383
+ "num_attention_heads": 12,
384
+ "num_hidden_layers": 28,
385
+ "num_key_value_heads": 2,
386
+ "rms_norm_eps": 1e-06,
387
+ "rope_scaling": null,
388
+ "rope_theta": 1000000.0,
389
+ "sliding_window": null,
390
+ "tie_word_embeddings": true,
391
+ "transformers_version": "4.57.1",
392
+ "use_cache": true,
393
+ "use_sliding_window": false,
394
+ "vocab_size": 151936
395
+ }
396
+
397
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:56:02,701 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10/chat_template.jinja
398
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:56:02,706 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10/tokenizer_config.json
399
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:56:02,710 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10/special_tokens_map.json
400
+ [INFO|trainer.py:2810] 2025-10-24 23:56:03,258 >>
401
+
402
+ Training completed. Do not forget to share your model on huggingface.co/models =)
403
+
404
+
405
+ {'train_runtime': 10.137, 'train_samples_per_second': 1.973, 'train_steps_per_second': 0.986, 'train_loss': 0.718793535232544, 'epoch': 0.0}
406
+ 100%|| 10/10 [00:09<00:00, 1.23it/s]100%|| 10/10 [00:09<00:00, 1.10it/s]
407
+ [INFO|trainer.py:4309] 2025-10-24 23:56:03,267 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints
408
+ [INFO|configuration_utils.py:765] 2025-10-24 23:56:03,356 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
409
+ [INFO|configuration_utils.py:839] 2025-10-24 23:56:03,357 >> Model config Qwen2Config {
410
+ "architectures": [
411
+ "Qwen2ForCausalLM"
412
+ ],
413
+ "attention_dropout": 0.0,
414
+ "bos_token_id": 151643,
415
+ "dtype": "bfloat16",
416
+ "eos_token_id": 151645,
417
+ "hidden_act": "silu",
418
+ "hidden_size": 1536,
419
+ "initializer_range": 0.02,
420
+ "intermediate_size": 8960,
421
+ "layer_types": [
422
+ "full_attention",
423
+ "full_attention",
424
+ "full_attention",
425
+ "full_attention",
426
+ "full_attention",
427
+ "full_attention",
428
+ "full_attention",
429
+ "full_attention",
430
+ "full_attention",
431
+ "full_attention",
432
+ "full_attention",
433
+ "full_attention",
434
+ "full_attention",
435
+ "full_attention",
436
+ "full_attention",
437
+ "full_attention",
438
+ "full_attention",
439
+ "full_attention",
440
+ "full_attention",
441
+ "full_attention",
442
+ "full_attention",
443
+ "full_attention",
444
+ "full_attention",
445
+ "full_attention",
446
+ "full_attention",
447
+ "full_attention",
448
+ "full_attention",
449
+ "full_attention"
450
+ ],
451
+ "max_position_embeddings": 32768,
452
+ "max_window_layers": 21,
453
+ "model_type": "qwen2",
454
+ "num_attention_heads": 12,
455
+ "num_hidden_layers": 28,
456
+ "num_key_value_heads": 2,
457
+ "rms_norm_eps": 1e-06,
458
+ "rope_scaling": null,
459
+ "rope_theta": 1000000.0,
460
+ "sliding_window": null,
461
+ "tie_word_embeddings": true,
462
+ "transformers_version": "4.57.1",
463
+ "use_cache": true,
464
+ "use_sliding_window": false,
465
+ "vocab_size": 151936
466
+ }
467
+
468
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:56:03,588 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/chat_template.jinja
469
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:56:03,592 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/tokenizer_config.json
470
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:56:03,596 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/special_tokens_map.json
471
+ ***** train metrics *****
472
+ epoch = 0.0007
473
+ total_flos = 414519GF
474
+ train_loss = 0.7188
475
+ train_runtime = 0:00:10.13
476
+ train_samples_per_second = 1.973
477
+ train_steps_per_second = 0.986
478
+ [INFO|modelcard.py:456] 2025-10-24 23:56:03,838 >> Dropping the following result as it does not have all the necessary fields:
479
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
480
+ [W1024 23:56:04.029787829 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
481
+ wandb:
482
+ wandb: View run testing__pvv2_lora at: https://wandb.ai/ut_nlp_deduce/llamafactory/runs/oqx8ngeo
483
+ wandb: Find logs at: wandb/run-20251024_235553-oqx8ngeo/logs
484
+ [W1024 23:56:05.730735839 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
485
+ [W1024 23:56:05.132682733 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
486
+ [W1024 23:56:05.555229777 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
487
+ [2025-10-24 23:56:06]
488
+ [2025-10-24 23:56:06] ========================================
489
+ [2025-10-24 23:56:06] Training completed successfully
490
+ [2025-10-24 23:56:06] End Time: Fri Oct 24 11:56:06 PM EDT 2025
491
+ [2025-10-24 23:56:06] ========================================
492
+ [2025-10-24 23:56:06]
493
+ [2025-10-24 23:56:06] ========================================
494
+ [2025-10-24 23:56:06] STAGE 2: Merging/Exporting Model
495
+ [2025-10-24 23:56:06] Start Time: Fri Oct 24 11:56:06 PM EDT 2025
496
+ [2025-10-24 23:56:06] ========================================
497
+ [2025-10-24 23:56:06] Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints
498
+ [2025-10-24 23:56:06] Analyzing checkpoints to find the one from current training run...
499
+ [2025-10-24 23:56:06] - checkpoint-10: trainer_state.json modified at Fri Oct 24 11:56:03 PM EDT 2025
500
+ [2025-10-24 23:56:06] - checkpoint-5: trainer_state.json modified at Fri Oct 24 11:55:58 PM EDT 2025
501
+ [2025-10-24 23:56:06]
502
+ [2025-10-24 23:56:06] Selected checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
503
+ [2025-10-24 23:56:06] This checkpoint has the most recently updated trainer_state.json
504
+ [2025-10-24 23:56:06] Checkpoint details:
505
+ [2025-10-24 23:56:06] Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
506
+ [2025-10-24 23:56:06] Last modified: 2025-10-24 23:56:03.255712120 -0400
507
+ [2025-10-24 23:56:06] Training step: 10
508
+ [2025-10-24 23:56:06] Updating merge config to point to checkpoint...
509
+ Successfully updated merge config
510
+ [2025-10-24 23:56:06] Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
511
+ [2025-10-24 23:56:06]
512
+ [2025-10-24 23:56:06] Merge config contents:
513
+ [2025-10-24 23:56:06] template: qwen
514
+ [2025-10-24 23:56:06] trust_remote_code: true
515
+ [2025-10-24 23:56:06] export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged
516
+ [2025-10-24 23:56:06] model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
517
+ [2025-10-24 23:56:06] adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
518
+ [2025-10-24 23:56:06]
519
+ [2025-10-24 23:56:06] Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/configs/merge_config.yaml
520
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
521
+ warnings.warn(
522
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
523
+ import pkg_resources
524
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,985 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
525
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,986 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
526
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,986 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,986 >> loading file added_tokens.json from cache at None
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,986 >> loading file special_tokens_map.json from cache at None
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,986 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:13,986 >> loading file chat_template.jinja from cache at None
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:56:14,157 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+ [INFO|configuration_utils.py:765] 2025-10-24 23:56:14,372 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
+ [INFO|configuration_utils.py:839] 2025-10-24 23:56:14,374 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "initializer_range": 0.02,
+ "intermediate_size": 8960,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 21,
+ "model_type": "qwen2",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
+
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file added_tokens.json from cache at None
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file special_tokens_map.json from cache at None
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:56:14,443 >> loading file chat_template.jinja from cache at None
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:56:14,608 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+ [INFO|configuration_utils.py:765] 2025-10-24 23:56:14,663 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
+ [INFO|configuration_utils.py:839] 2025-10-24 23:56:14,663 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "initializer_range": 0.02,
+ "intermediate_size": 8960,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 21,
+ "model_type": "qwen2",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
+
+ [WARNING|logging.py:328] 2025-10-24 23:56:14,663 >> `torch_dtype` is deprecated! Use `dtype` instead!
+ [INFO|2025-10-24 23:56:14] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
+ [WARNING|logging.py:328] 2025-10-24 23:56:15,013 >> `torch_dtype` is deprecated! Use `dtype` instead!
+ [INFO|modeling_utils.py:1172] 2025-10-24 23:56:15,014 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/model.safetensors
+ [INFO|modeling_utils.py:2341] 2025-10-24 23:56:15,015 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
+ [INFO|configuration_utils.py:986] 2025-10-24 23:56:15,016 >> Generate config GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": 151645
+ }
+
+ [INFO|configuration_utils.py:941] 2025-10-24 23:56:15,118 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/generation_config.json
+ [INFO|configuration_utils.py:986] 2025-10-24 23:56:15,119 >> Generate config GenerationConfig {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.1,
+ "temperature": 0.7,
+ "top_k": 20,
+ "top_p": 0.8
+ }
+
+ [INFO|dynamic_module_utils.py:423] 2025-10-24 23:56:15,148 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-1.5B-Instruct.
+ [INFO|2025-10-24 23:56:15] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+ [INFO|2025-10-24 23:56:17] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
+ [INFO|2025-10-24 23:56:17] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
+ [INFO|2025-10-24 23:56:17] llamafactory.model.loader:143 >> all params: 1,543,714,304
+ [INFO|2025-10-24 23:56:17] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
+ [INFO|configuration_utils.py:491] 2025-10-24 23:56:17,909 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/config.json
+ [INFO|configuration_utils.py:757] 2025-10-24 23:56:17,914 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/generation_config.json
+ [INFO|modeling_utils.py:4181] 2025-10-24 23:56:21,705 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/model.safetensors
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:56:21,725 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/chat_template.jinja
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:56:21,745 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/tokenizer_config.json
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:56:21,765 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/special_tokens_map.json
+ [INFO|2025-10-24 23:56:21] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged/Modelfile
+ [2025-10-24 23:56:22]
+ [2025-10-24 23:56:22] ========================================
+ [2025-10-24 23:56:22] Merge/Export completed successfully
+ [2025-10-24 23:56:22] End Time: Fri Oct 24 11:56:22 PM EDT 2025
+ [2025-10-24 23:56:22] ========================================
+ [2025-10-24 23:56:22]
+ [2025-10-24 23:56:22] ========================================
+ [2025-10-24 23:56:22] Preparing Training Artifacts
+ [2025-10-24 23:56:22] ========================================
+ [2025-10-24 23:56:22] Copying configuration files...
+ [2025-10-24 23:56:22] Copying and cleaning training logs...
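The merge log above ends with the LoRA adapter folded into Qwen/Qwen2.5-1.5B-Instruct and the result exported to the merged/ directory. Below is a minimal smoke-test sketch, assuming transformers and torch are installed and the merged/ export is reachable at the path shown: the directory and the sampling values (temperature 0.7, top_p 0.8, top_k 20, repetition_penalty 1.1) are copied from the log, while the prompt, token budget, and device handling are illustrative, not part of this upload.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Export directory written by the merge step in the log above.
merged_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(merged_dir)
# Newer transformers accepts dtype=; older releases use torch_dtype= instead.
model = AutoModelForCausalLM.from_pretrained(merged_dir, dtype=torch.bfloat16).to(device)

# Build a ChatML-style prompt through the saved chat template.
messages = [{"role": "user", "content": "Say hello in one sentence."}]  # hypothetical test prompt
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(device)

# Sampling settings mirror the GenerationConfig printed in the log.
output_ids = model.generate(
    input_ids,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    repetition_penalty=1.1,
)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))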
training_artifacts/merge_config.yaml ADDED
@@ -0,0 +1,5 @@
+ template: qwen
+ trust_remote_code: true
+ export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged
+ model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
+ adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10
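This export config drove the merge step logged above: it names the base model, the adapter checkpoint, the Qwen chat template, and the output directory (LLaMA-Factory consumes a file like this through its export command, e.g. llamafactory-cli export merge_config.yaml). To make the merge step concrete, here is a rough hand-rolled equivalent using peft; it is only a sketch under the assumption that transformers and peft are installed, not the code LLaMA-Factory actually runs.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Values taken from merge_config.yaml above.
base_id = "Qwen/Qwen2.5-1.5B-Instruct"
adapter_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10"
export_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/merged"

# Load the base model, attach the LoRA adapter, and fold its deltas into the weights.
base = AutoModelForCausalLM.from_pretrained(base_id, dtype=torch.bfloat16, trust_remote_code=True)
model = PeftModel.from_pretrained(base, adapter_dir)
merged = model.merge_and_unload()

# Write the standalone merged checkpoint plus the tokenizer files next to it.
merged.save_pretrained(export_dir)
AutoTokenizer.from_pretrained(base_id, trust_remote_code=True).save_pretrained(export_dir)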
training_artifacts/train_config.yaml ADDED
@@ -0,0 +1,32 @@
+ stage: sft
+ do_train: true
+ max_samples: 100000
+ do_eval: false
+ save_strategy: steps
+ save_steps: 5
+ logging_steps: 10
+ fp16: false
+ bf16: true
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ overwrite_output_dir: true
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 1
+ gradient_checkpointing: true
+ learning_rate: 1.0e-06
+ lr_scheduler_type: cosine
+ num_train_epochs: 2
+ warmup_ratio: 0.05
+ weight_decay: 0.0001
+ template: qwen
+ max_steps: 10
+ preprocessing_num_workers: 16
+ overwrite_cache: true
+ model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
+ finetuning_type: lora
+ trust_remote_code: true
+ dataset: TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
+ dataset_dir: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data
+ cutoff_len: 16192
+ tokenized_path: /scratch/zrs2020/.cache/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
+ output_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints
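A few things this training config implies: with max_steps: 10 and save_steps: 5, checkpoints are written at steps 5 and 10, and checkpoint-10 under output_dir is exactly the adapter path the merge config above points at; the effective batch size is 1 sequence per optimizer step per device (per_device_train_batch_size * gradient_accumulation_steps). A small sketch, assuming PyYAML is installed and the file is read at its repo-relative path, that loads the config and reconstructs those values:

import os
import yaml  # PyYAML, assumed available

# Repo-relative path of the file shown above.
with open("training_artifacts/train_config.yaml") as f:
    cfg = yaml.safe_load(f)

effective_batch = cfg["per_device_train_batch_size"] * cfg["gradient_accumulation_steps"]
last_ckpt = os.path.join(cfg["output_dir"], f"checkpoint-{cfg['max_steps']}")

print(f"effective per-device batch size: {effective_batch}")
print(f"final checkpoint: {last_ckpt}")
# -> /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_lora/checkpoints/checkpoint-10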
vocab.json ADDED
The diff for this file is too large to render. See raw diff