soldni committed
Commit f6703ac · verified · 1 Parent(s): 9708967

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,96 @@
---
library_name: transformers
tags: []
---

# Dolma 2 tokenizer, Instruct v2, reasoner version

A slightly modified version of `cl100k_base` that supports the Dolma 1.x and Dolma 2.x special tokens.

## Special tokens

This tokenizer supports the following special tokens:

- `<|extra_id_0|>`: Not used.
- `<|endoftext|>`: Used to mark both the beginning and the end of text.
- `<|fim_prefix|>`: Used to mark the prefix of a fill-in-the-middle request.
- `<|fim_middle|>`: Used to mark the middle of a fill-in-the-middle request.
- `<|fim_suffix|>`: Used to mark the suffix of a fill-in-the-middle request.
- `|||PHONE_NUMBER|||`: Not used. Kept for compatibility with Dolma 1.x.
- `|||EMAIL_ADDRESS|||`: Not used. Kept for compatibility with Dolma 1.x.
- `|||IP_ADDRESS|||`: Not used. Kept for compatibility with Dolma 1.x.
- `<|im_start|>`: Indicates the beginning of a message (a turn in a conversation).
- `<|im_end|>`: Indicates the end of a message (a turn in a conversation).
- `<|extra_id_1|>`: Not used.
- `<|extra_id_2|>`: Not used.
- `<think>`: Indicates the beginning of the model's thoughts.
- `</think>`: Indicates the end of the model's thoughts.
- `<|extra_id_3|>`: Not used.
- `<|extra_id_4|>`: Not used.
- `<|extra_id_5|>`: Not used.
- `<|extra_id_6|>`: Not used.
- `<answer>`: Indicates the beginning of the model's answer in thinking mode.
- `</answer>`: Indicates the end of the model's answer in thinking mode.
- `<|endofprompt|>`: Not used.
- `<|pad|>`: Used to pad input sequences.

## Chat template

The chat template is as follows (**for reference only**; the actual template is in `tokenizer_config.json`):

```jinja
{% set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 %}
{% if not has_system %}
{{ '<|im_start|>system
You are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai.<|im_end|>
' }}
{% endif %}
{% for message in messages %}
{% if message['role'] == 'system' %}
{{ '<|im_start|>system
' + message['content'] }}
{% if message.get('functions', none) is not none %}
{{ ' <functions>' + message['functions'] + '</functions><|im_end|>
' }}
{% else %}
{{ ' You do not currently have access to any functions. <functions></functions><|im_end|>
' }}
{% endif %}
{% elif message['role'] == 'user' %}
{% if message.get('functions', none) is not none %}
{{ '<|im_start|>user
' + message['content'] + '
' + '<functions>' + message['functions'] + '</functions><|im_end|>
' }}
{% else %}
{{ '<|im_start|>user
' + message['content'] + '<|im_end|>
' }}
{% endif %}
{% elif message['role'] == 'assistant' %}
{{ '<|im_start|>assistant
' }}
{% if message.get('content', none) is not none %}
{{ message['content'] }}
{% endif %}
{% if message.get('function_calls', none) is not none %}
{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}
{% endif %}
{% if not loop.last %}
{{ '<|im_end|>' + '
' }}
{% else %}
{{ eos_token }}
{% endif %}
{% elif message['role'] == 'environment' %}
{{ '<|im_start|>environment
' + message['content'] + '<|im_end|>
' }}
{% endif %}
{% if loop.last and add_generation_prompt %}
{{ '<|im_start|>assistant
<think>' }}
{% endif %}
{% endfor %}
```
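
For reference, a minimal sketch of applying this template through `transformers`, assuming you run it from a local checkout of this repo (the path `"."` below is just that assumption, not something defined by this commit):

```python
# Minimal sketch: render the chat template from a local checkout of this repo.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # assumption: current dir holds these files

messages = [{"role": "user", "content": "What is the capital of France?"}]

# With add_generation_prompt=True, the template ends the prompt with
# "<|im_start|>assistant\n<think>", so generation starts in thinking mode.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```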
fix_tokens.py ADDED
@@ -0,0 +1,364 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "click",
# ]
# ///

from dataclasses import dataclass, asdict, field
from enum import Enum
from pathlib import Path
import click
import json


class SpecialTokensMapEnum(Enum):
    BOS_TOKEN = "bos_token"
    EOS_TOKEN = "eos_token"
    PAD_TOKEN = "pad_token"
    UNK_TOKEN = "unk_token"


@dataclass(frozen=True)
class SpecialToken:
    id: int
    content: str
    lstrip: bool = False
    normalized: bool = False
    rstrip: bool = False
    single_word: bool = False
    special: bool = False
    special_token_map: list[SpecialTokensMapEnum] = field(default_factory=list)

    def to_added_tokens_decoder(self):
        data = asdict(self)
        token_id = str(data.pop("id"))
        data.pop("special_token_map")
        return {token_id: data}

    def to_added_tokens(self):
        data = asdict(self)
        data.pop("special_token_map")
        return data

    def to_special_tokens_map(self) -> dict[str, dict]:
        special_tokens_map = {}
        for special_token_map in self.special_token_map:
            data = asdict(self)
            data.pop("special_token_map")
            data.pop("special")
            data.pop("id")
            special_tokens_map[special_token_map.value] = data

        return special_tokens_map


MODEL_MAX_LENGTH = 65536

DESIRED_MAPPING = [
    SpecialToken(id=100256, content="<|extra_id_0|>"),
    SpecialToken(
        id=100257,
        content="<|endoftext|>",
        special=True,
        special_token_map=[
            SpecialTokensMapEnum.BOS_TOKEN,
            SpecialTokensMapEnum.EOS_TOKEN,
            SpecialTokensMapEnum.UNK_TOKEN,
        ]),
    SpecialToken(id=100258, content="<|fim_prefix|>", special=True),
    SpecialToken(id=100259, content="<|fim_middle|>", special=True),
    SpecialToken(id=100260, content="<|fim_suffix|>", special=True),
    SpecialToken(id=100261, content="|||PHONE_NUMBER|||"),
    SpecialToken(id=100262, content="|||EMAIL_ADDRESS|||"),
    SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
    SpecialToken(id=100264, content="<|im_start|>", special=True),
    SpecialToken(id=100265, content="<|im_end|>", special=True),
    SpecialToken(id=100266, content="<|extra_id_1|>"),
    SpecialToken(id=100267, content="<|extra_id_2|>"),
    SpecialToken(id=100268, content="<|extra_id_3|>"),
    SpecialToken(id=100269, content="<|extra_id_4|>"),
    SpecialToken(id=100270, content="<|extra_id_5|>"),
    SpecialToken(id=100271, content="<|extra_id_6|>"),
    SpecialToken(id=100272, content="<|extra_id_7|>"),
    SpecialToken(id=100273, content="<|extra_id_8|>"),
    SpecialToken(id=100274, content="<|extra_id_9|>"),
    SpecialToken(id=100275, content="<|extra_id_10|>"),
    SpecialToken(id=100276, content="<|endofprompt|>", special=True),
    SpecialToken(
        id=100277,
        content="<|pad|>",
        special=True,
        special_token_map=[SpecialTokensMapEnum.PAD_TOKEN],
    ),
]

SCRIPT_DIR = Path(__file__).parent
TOKENIZER_CONFIG_FILE = SCRIPT_DIR / "tokenizer_config.json"
TOKENIZER_FILE = SCRIPT_DIR / "tokenizer.json"
VOCAB_FILE = SCRIPT_DIR / "vocab.json"
SPECIAL_TOKENS_MAP_FILE = SCRIPT_DIR / "special_tokens_map.json"


@click.group()
def cli():
    """Tools to check and fix the tokenizer configuration files."""
    pass


def _get_mapped_special_token(
    special_tokens: list[SpecialToken],
    mapped_token: SpecialTokensMapEnum
) -> SpecialToken:
    all_mapped_tokens = [token for token in special_tokens if mapped_token in token.special_token_map]
    if len(all_mapped_tokens) == 0:
        raise ValueError(f"Cannot find mapped token for {mapped_token}")
    if len(all_mapped_tokens) > 1:
        all_mapped_tokens_str = ", ".join([token.content for token in all_mapped_tokens])
        raise ValueError(f"Found multiple mapped tokens for {mapped_token}: {all_mapped_tokens_str}")
    return all_mapped_tokens[0]


def get_unk_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.UNK_TOKEN)


def get_bos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.BOS_TOKEN)


def get_eos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.EOS_TOKEN)


def get_pad_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.PAD_TOKEN)


@cli.command()
def check():
    """Check if the current config matches the desired mapping."""

    # STEP 1: Check the Tokenizer Config File #
    print("STEP 1: Checking tokenizer config file...")

    if not TOKENIZER_CONFIG_FILE.exists():
        raise FileNotFoundError(f"Tokenizer config file not found: {TOKENIZER_CONFIG_FILE}")

    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    added_tokens_decoder = tokenizer_config.get("added_tokens_decoder", {})
    for token in DESIRED_MAPPING:
        str_token_id = str(token.id)
        if str_token_id not in added_tokens_decoder:
            raise ValueError(f"Token {token.id} not found in added tokens decoder")

        computed_added_tokens_decoder = token.to_added_tokens_decoder()
        if computed_added_tokens_decoder[str_token_id] != added_tokens_decoder[str_token_id]:
            raise ValueError(f"Token {token.id} has different content in added tokens decoder")

        print(f"Token {token.id} found in added tokens decoder; content matches")

    bos_token = get_bos_token(DESIRED_MAPPING)
    if bos_token.content != tokenizer_config["bos_token"]:
        raise ValueError(f"Bos token content mismatch: {bos_token.content} != {tokenizer_config['bos_token']}")
    else:
        print("Bos token content matches")

    eos_token = get_eos_token(DESIRED_MAPPING)
    if eos_token.content != tokenizer_config["eos_token"]:
        raise ValueError(f"Eos token content mismatch: {eos_token.content} != {tokenizer_config['eos_token']}")
    else:
        print("Eos token content matches")

    pad_token = get_pad_token(DESIRED_MAPPING)
    if pad_token.content != tokenizer_config["pad_token"]:
        raise ValueError(f"Pad token content mismatch: {pad_token.content} != {tokenizer_config['pad_token']}")
    else:
        print("Pad token content matches")

    unk_token = get_unk_token(DESIRED_MAPPING)
    if unk_token.content != tokenizer_config["unk_token"]:
        raise ValueError(f"Unk token content mismatch: {unk_token.content} != {tokenizer_config['unk_token']}")
    else:
        print("Unk token content matches")

    if tokenizer_config["model_max_length"] != MODEL_MAX_LENGTH:
        raise ValueError(f"Model max length mismatch: {tokenizer_config['model_max_length']} != {MODEL_MAX_LENGTH}")
    else:
        print("Model max length matches")

    # STEP 2: Check the Tokenizer File #
    print("STEP 2: Checking tokenizer file...")

    if not TOKENIZER_FILE.exists():
        raise FileNotFoundError(f"Tokenizer file not found: {TOKENIZER_FILE}")

    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)

    # check if added_tokens matches
    added_tokens_dict = {token["id"]: token for token in tokenizer.get("added_tokens", [])}
    for token in DESIRED_MAPPING:
        if token.id not in added_tokens_dict:
            raise ValueError(f"Token {token.id} not found in added tokens")

        computed_added_token = token.to_added_tokens()
        if computed_added_token != added_tokens_dict[token.id]:
            raise ValueError(f"Token {token.id} has different content in added tokens")
        print(f"Token {token.id} found in added tokens; content matches.")

    # check vocab
    vocab = tokenizer.get("model", {}).get("vocab", {})
    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    seen_values: dict[int, list[str]] = {}
    for key, value in vocab.items():
        seen_values.setdefault(value, []).append(key)

    broken_vocab = False
    for value, keys in seen_values.items():
        if len(keys) > 1:
            broken_vocab = True
            print(f"Vocab value {value} is not unique; keys: {keys}")

    if broken_vocab:
        raise ValueError("Vocab values are not unique")
    else:
        print("Vocab values are unique")

    # STEP 3: Check the Vocab File #
    print("STEP 3: Checking vocab file...")

    if not VOCAB_FILE.exists():
        raise FileNotFoundError(f"Vocab file not found: {VOCAB_FILE}")

    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)

    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    if len(set(vocab.values())) != len(vocab):
        raise ValueError("Vocab values are not unique")

    # STEP 4: Check the Special Tokens Map File #
    print("STEP 4: Checking special tokens map file...")

    if not SPECIAL_TOKENS_MAP_FILE.exists():
        raise FileNotFoundError(f"Special tokens map file not found: {SPECIAL_TOKENS_MAP_FILE}")

    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    # This checks the special tokens map file.
    seen_special_tokens = set()
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            if key not in special_tokens_map:
                raise ValueError(f"Special token map {key} not found in special tokens map")
            if value != special_tokens_map[key]:
                raise ValueError(f"Special token map {key} content mismatch: {value} != {special_tokens_map[key]}")

            print(f"Special token map {key} content matches")
            seen_special_tokens.add(key)

    if len(seen_special_tokens) != len(special_tokens_map):
        raise ValueError("Special tokens map values are not unique")
    print("All special tokens map values match")


@cli.command()
def fix():
    """Fix the tokens in the tokenizer config, tokenizer file, vocab file, and special tokens map file."""

    print("STEP 1: Fixing tokenizer config file...")
    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    tokenizer_config["bos_token"] = get_bos_token(DESIRED_MAPPING).content
    tokenizer_config["eos_token"] = get_eos_token(DESIRED_MAPPING).content
    tokenizer_config["pad_token"] = get_pad_token(DESIRED_MAPPING).content
    tokenizer_config["unk_token"] = get_unk_token(DESIRED_MAPPING).content
    tokenizer_config["model_max_length"] = MODEL_MAX_LENGTH

    added_tokens_decoder = {}
    for token in DESIRED_MAPPING:
        added_tokens_decoder.update(token.to_added_tokens_decoder())
    tokenizer_config["added_tokens_decoder"] = added_tokens_decoder

    with open(TOKENIZER_CONFIG_FILE, "w") as f:
        json.dump(tokenizer_config, f, indent=2)
    print(f"Updated tokenizer config file in {TOKENIZER_CONFIG_FILE}.")

    print("STEP 2: Fixing tokenizer file...")
    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)
    added_tokens = []
    for token in DESIRED_MAPPING:
        added_tokens.append(token.to_added_tokens())
    tokenizer["added_tokens"] = added_tokens

    for token in DESIRED_MAPPING:
        # remove any existing vocab entry that already uses this id
        for key in list(tokenizer["model"]["vocab"].keys()):
            if tokenizer["model"]["vocab"][key] == token.id:
                tokenizer["model"]["vocab"].pop(key)

        # now that the id is free, add the token
        tokenizer["model"]["vocab"][token.content] = token.id

    with open(TOKENIZER_FILE, "w") as f:
        json.dump(tokenizer, f, indent=2)

    print(f"Updated tokenizer file in {TOKENIZER_FILE}.")

    print("STEP 3: Fixing vocab file...")
    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)
    for token in DESIRED_MAPPING:
        # remove any existing vocab entry that already uses this id
        for key in list(vocab.keys()):
            if vocab[key] == token.id:
                vocab.pop(key)

        # now that the id is free, add the token
        vocab[token.content] = token.id
    with open(VOCAB_FILE, "w") as f:
        json.dump(vocab, f, indent=2)
    print(f"Updated vocab file in {VOCAB_FILE}.")

    print("STEP 4: Fixing special tokens map file...")
    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            special_tokens_map[key] = value
            print(f"Updated special token map {key} content")

    with open(SPECIAL_TOKENS_MAP_FILE, "w") as f:
        json.dump(special_tokens_map, f, indent=2)

    print(f"Updated special tokens map file in {SPECIAL_TOKENS_MAP_FILE}.")


if __name__ == "__main__":
    cli()
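
As a quick illustration (a sketch only, assuming `fix_tokens.py` is importable, e.g. when run from the repo root), each `DESIRED_MAPPING` entry expands into one record of the `added_tokens_decoder` section that the `fix` command writes into `tokenizer_config.json`:

```python
# Sketch: inspect how one DESIRED_MAPPING entry is serialized.
# Assumes fix_tokens.py is on the import path (run from the repo root).
from fix_tokens import DESIRED_MAPPING

pad = next(t for t in DESIRED_MAPPING if t.content == "<|pad|>")
print(pad.to_added_tokens_decoder())
# {'100277': {'content': '<|pad|>', 'lstrip': False, 'normalized': False,
#             'rstrip': False, 'single_word': False, 'special': True}}
```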
generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "_from_model_config": true,
  "eos_token_id": [
    100265,
    100257
  ],
  "pad_token_id": 100277,
  "transformers_version": "4.53.1"
}
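
These ids correspond to the special tokens defined in this commit: generation stops on either `<|im_end|>` (100265) or `<|endoftext|>` (100257), and `<|pad|>` (100277) is the padding id. A small loading sketch, assuming a local checkout of this repo:

```python
# Sketch: load the generation defaults shipped in this commit.
# Assumes the current directory is a checkout of this repo.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(".")
print(gen_cfg.eos_token_id)  # [100265, 100257] -> <|im_end|>, <|endoftext|>
print(gen_cfg.pad_token_id)  # 100277 -> <|pad|>
```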
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|pad|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
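
In other words, `<|endoftext|>` doubles as BOS, EOS, and UNK, while `<|pad|>` is the padding token. A sketch of how this surfaces on a loaded tokenizer, again assuming a local checkout of this repo:

```python
# Sketch: special-token attributes exposed by a tokenizer loaded from this repo.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # assumption: current dir holds these files
print(tok.bos_token, tok.eos_token, tok.unk_token)  # all "<|endoftext|>"
print(tok.pad_token, tok.pad_token_id)              # "<|pad|>", 100277
print(tok.convert_tokens_to_ids("<|endoftext|>"))   # 100257
```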
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,189 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "100256": {
      "content": "<|extra_id_0|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100257": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100258": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100259": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100260": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100261": {
      "content": "|||PHONE_NUMBER|||",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100262": {
      "content": "|||EMAIL_ADDRESS|||",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100263": {
      "content": "|||IP_ADDRESS|||",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100264": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100265": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100266": {
      "content": "<|extra_id_1|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100267": {
      "content": "<|extra_id_2|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100268": {
      "content": "<|extra_id_3|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100269": {
      "content": "<|extra_id_4|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100270": {
      "content": "<|extra_id_5|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100271": {
      "content": "<|extra_id_6|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100272": {
      "content": "<|extra_id_7|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100273": {
      "content": "<|extra_id_8|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100274": {
      "content": "<|extra_id_9|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100275": {
      "content": "<|extra_id_10|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100276": {
      "content": "<|endofprompt|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100277": {
      "content": "<|pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|endoftext|>",
  "chat_template": "{% set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 %}{% if not has_system %}{{ '<|im_start|>system\nYou are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai.<|im_end|>\n' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|im_start|>system\n' + message['content'] }}{% if message.get('functions', none) is not none %}{{ ' <functions>' + message['functions'] + '</functions><|im_end|>\n' }}{% else %}{{ ' You do not currently have access to any functions. <functions></functions><|im_end|>\n' }}{% endif %}{% elif message['role'] == 'user' %}{% if message.get('functions', none) is not none %}{{ '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' }}{% else %}{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% elif message['role'] == 'assistant' %}{{ '<|im_start|>assistant\n' }}{% if message.get('content', none) is not none %}{{ message['content'] }}{% endif %}{% if message.get('function_calls', none) is not none %}{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}{% endif %}{% if not loop.last %}{{ '<|im_end|>' + '\n' }}{% else %}{{ eos_token }}{% endif %}{% elif message['role'] == 'environment' %}{{ '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|im_start|>assistant\n<think>' }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "model_max_length": 65536,
  "pad_token": "<|pad|>",
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
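
The `chat_template` above also handles optional `functions`, `function_calls`, and `environment` fields. A hedged sketch of a tool-use conversation; the payload strings are invented purely to show where the template places them, and the path again assumes a local checkout of this repo:

```python
# Sketch only: "functions" / "function_calls" payloads are made-up strings used
# to illustrate where the template inserts them.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # assumption: current dir holds these files

messages = [
    {"role": "system", "content": "You are Olmo.", "functions": '[{"name": "get_weather"}]'},
    {"role": "user", "content": "What is the weather in Paris?"},
    {"role": "assistant", "content": "", "function_calls": '[{"name": "get_weather", "arguments": {"city": "Paris"}}]'},
    {"role": "environment", "content": '{"temp_c": 18}'},
]

# The assistant turn is wrapped in <function_calls>...</function_calls>, the
# environment turn in <|im_start|>environment ... <|im_end|>, and the prompt
# ends with "<|im_start|>assistant\n<think>".
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```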
vocab.json ADDED
The diff for this file is too large to render. See raw diff