#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "click",
# ]
# ///
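
"""Check and repair the special-token layout shared by the tokenizer files in
this directory: tokenizer_config.json, tokenizer.json, vocab.json, and
special_tokens_map.json.

Two click subcommands are provided: `check` validates each file against
DESIRED_MAPPING and raises on the first mismatch, and `fix` rewrites the files
in place so that they match it.
"""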

from dataclasses import dataclass, asdict, field
from enum import Enum
from pathlib import Path
import click
import json


class SpecialTokensMapEnum(Enum):
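    """Special-token roles (bos/eos/pad/unk) that a token can be mapped to."""
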
    BOS_TOKEN = "bos_token"
    EOS_TOKEN = "eos_token"
    PAD_TOKEN = "pad_token"
    UNK_TOKEN = "unk_token"


@dataclass(frozen=True)
class SpecialToken:
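    """A single added token and the special-token roles it fills in the tokenizer files."""
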
    id: int
    content: str
    lstrip: bool = False
    normalized: bool = False
    rstrip: bool = False
    single_word: bool = False
    special: bool = False
    special_token_map: list[SpecialTokensMapEnum] = field(default_factory=list)

    def to_added_tokens_decoder(self):
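        """Return this token's `added_tokens_decoder` entry, keyed by its id as a string."""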
        data = asdict(self)
        token_id = str(data.pop("id"))
        data.pop("special_token_map")
        return {token_id: data}

    def to_added_tokens(self):
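        """Return this token's entry for the `added_tokens` list in tokenizer.json."""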
        data = asdict(self)
        data.pop("special_token_map")
        return data

    def to_special_tokens_map(self) -> dict[str, dict]:
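        """Return the special_tokens_map.json entries contributed by this token, keyed by role."""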
        special_tokens_map = {}
        for special_token_map in self.special_token_map:
            data = asdict(self)
            data.pop("special_token_map")
            data.pop("special")
            data.pop("id")
            special_tokens_map[special_token_map.value] = data

        return special_tokens_map


MODEL_MAX_LENGTH = 65536
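
# The canonical set of added/special tokens that every tokenizer file is expected to agree on.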

DESIRED_MAPPING = [
      SpecialToken(id=100256, content="<|extra_id_0|>"),
      SpecialToken(
        id=100257,
        content="<|endoftext|>",
        special=True,
        special_token_map=[
            SpecialTokensMapEnum.BOS_TOKEN,
            SpecialTokensMapEnum.EOS_TOKEN,
            SpecialTokensMapEnum.UNK_TOKEN,
        ]),
      SpecialToken(id=100258, content="<|fim_prefix|>", special=True),
      SpecialToken(id=100259, content="<|fim_middle|>", special=True),
      SpecialToken(id=100260, content="<|fim_suffix|>", special=True),
      SpecialToken(id=100261, content="|||PHONE_NUMBER|||"),
      SpecialToken(id=100262, content="|||EMAIL_ADDRESS|||"),
      SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
      SpecialToken(id=100264, content="<|im_start|>", special=True),
      SpecialToken(id=100265, content="<|im_end|>", special=True),
      SpecialToken(id=100266, content="<|extra_id_1|>"),
      SpecialToken(id=100267, content="<|extra_id_2|>"),
      SpecialToken(id=100268, content="<|extra_id_3|>"),
      SpecialToken(id=100269, content="<|extra_id_4|>"),
      SpecialToken(id=100270, content="<|extra_id_5|>"),
      SpecialToken(id=100271, content="<|extra_id_6|>"),
      SpecialToken(id=100272, content="<|extra_id_7|>"),
      SpecialToken(id=100273, content="<|extra_id_8|>"),
      SpecialToken(id=100274, content="<|extra_id_9|>"),
      SpecialToken(id=100275, content="<|extra_id_10|>"),
      SpecialToken(id=100276, content="<|endofprompt|>", special=True),
      SpecialToken(
        id=100277,
        content="<|pad|>",
        special=True,
        special_token_map=[SpecialTokensMapEnum.PAD_TOKEN],
      ),
]

SCRIPT_DIR = Path(__file__).parent
TOKENIZER_CONFIG_FILE = SCRIPT_DIR / "tokenizer_config.json"
TOKENIZER_FILE = SCRIPT_DIR / "tokenizer.json"
VOCAB_FILE = SCRIPT_DIR / "vocab.json"
SPECIAL_TOKENS_MAP_FILE = SCRIPT_DIR / "special_tokens_map.json"


@click.group()
def cli():
    """Dataset processing tools."""
    pass


def _get_mapped_special_token(
    special_tokens: list[SpecialToken],
    mapped_token: SpecialTokensMapEnum
) -> SpecialToken:
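    """Return the unique token assigned to `mapped_token`, raising if none or several match."""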
    all_mapped_tokens = [token for token in special_tokens if mapped_token in token.special_token_map]
    if len(all_mapped_tokens) == 0:
        raise ValueError(f"Cannot find mapped token for {mapped_token}")
    if len(all_mapped_tokens) > 1:
        all_mapped_tokens_str = ", ".join([token.content for token in all_mapped_tokens])
        raise ValueError(f"Found multiple mapped tokens for {mapped_token}: {all_mapped_tokens_str}")
    return all_mapped_tokens[0]


def get_unk_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.UNK_TOKEN)


def get_bos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.BOS_TOKEN)


def get_eos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.EOS_TOKEN)


def get_pad_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.PAD_TOKEN)


@cli.command()
def check():
    """Check if the current config matches the desired mapping."""

    # STEP 1: Check the Tokenizer Config File #
    print("STEP 1: Checking tokenizer config file...")

    if not TOKENIZER_CONFIG_FILE.exists():
        raise FileNotFoundError(f"Tokenizer config file not found: {TOKENIZER_CONFIG_FILE}")

    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    added_tokens_decoder = tokenizer_config.get("added_tokens_decoder", {})
    for token in DESIRED_MAPPING:
        str_token_id = str(token.id)
        if str_token_id not in added_tokens_decoder:
            raise ValueError(f"Token {token.id} not found in added tokens decoder")

        computed_added_tokens_decoder = token.to_added_tokens_decoder()
        if computed_added_tokens_decoder[str_token_id] != added_tokens_decoder[str_token_id]:
            raise ValueError(f"Token {token.id} has different content in added tokens decoder")

        print(f"Token {token.id} found in added tokens decoder; content matches")

    bos_token = get_bos_token(DESIRED_MAPPING)
    if bos_token.content != tokenizer_config["bos_token"]:
        raise ValueError(f"Bos token content mismatch: {bos_token.content} != {tokenizer_config['bos_token']}")
    else:
        print("Bos token content matches")

    eos_token = get_eos_token(DESIRED_MAPPING)
    if eos_token.content != tokenizer_config["eos_token"]:
        raise ValueError(f"Eos token content mismatch: {eos_token.content} != {tokenizer_config['eos_token']}")
    else:
        print("Eos token content matches")

    pad_token = get_pad_token(DESIRED_MAPPING)
    if pad_token.content != tokenizer_config["pad_token"]:
        raise ValueError(f"Pad token content mismatch: {pad_token.content} != {tokenizer_config['pad_token']}")
    else:
        print("Pad token content matches")

    unk_token = get_unk_token(DESIRED_MAPPING)
    if unk_token.content != tokenizer_config["unk_token"]:
        raise ValueError(f"Unk token content mismatch: {unk_token.content} != {tokenizer_config['unk_token']}")
    else:
        print("Unk token content matches")

    if tokenizer_config["model_max_length"] != MODEL_MAX_LENGTH:
        raise ValueError(f"Model max length mismatch: {tokenizer_config['model_max_length']} != {MODEL_MAX_LENGTH}")
    else:
        print("Model max length matches")


    # STEP 2: Check the Tokenizer File #
    print("STEP 2: Checking tokenizer file...")

    if not TOKENIZER_FILE.exists():
        raise FileNotFoundError(f"Tokenizer file not found: {TOKENIZER_FILE}")

    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)

    # check if added_tokens matches
    added_tokens_dict = {token["id"]: token for token in tokenizer.get("added_tokens", [])}
    for token in DESIRED_MAPPING:
        if token.id not in added_tokens_dict:
            raise ValueError(f"Token {token.id} not found in added tokens")

        computed_added_token = token.to_added_tokens()
        if computed_added_token != added_tokens_dict[token.id]:
            raise ValueError(f"Token {token.id} has different content in added tokens")
        print(f"Token {token.id} found in added tokens; content matches.")

    # check vocab
    vocab = tokenizer.get("model", {}).get("vocab", {})
    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    # Build a reverse map (id -> tokens) to verify that no two vocab entries share an id.
    seen_values: dict[int, list[str]] = {}
    for key, value in vocab.items():
        seen_values.setdefault(value, []).append(key)

    broken_vocab = False
    for value, keys in seen_values.items():
        if len(keys) > 1:
            broken_vocab = True
            print(f"Vocab value {value} is not unique; keys: {keys}")

    if broken_vocab:
        raise ValueError("Vocab values are not unique")
    else:
        print("Vocab values are unique")

    # STEP 3: Check the Vocab File #
    print("STEP 3: Checking vocab file...")

    if not VOCAB_FILE.exists():
        raise FileNotFoundError(f"Vocab file not found: {VOCAB_FILE}")

    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)

    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    if len(set(vocab.values())) != len(vocab):
        raise ValueError("Vocab values are not unique")
    print("Vocab values are unique")

    # STEP 4: Check the Special Tokens Map File #
    print("STEP 4: Checking special tokens map file...")

    if not SPECIAL_TOKENS_MAP_FILE.exists():
        raise FileNotFoundError(f"Special tokens map file not found: {SPECIAL_TOKENS_MAP_FILE}")

    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    # Track every key we verify so entries not covered by DESIRED_MAPPING can be detected afterwards.
    seen_special_tokens = set()
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            if key not in special_tokens_map:
                raise ValueError(f"Special token map {key} not found in special tokens map")
            if value != special_tokens_map[key]:
                raise ValueError(f"Special token map {key} content mismatch: {value} != {special_tokens_map[key]}")

            print(f"Special token map {key} content matches")
            seen_special_tokens.add(key)

    if len(seen_special_tokens) != len(special_tokens_map):
        raise ValueError("Special tokens map contains entries that are not part of the desired mapping")
    print("All special tokens map values match")


@cli.command()
def fix():
    """Fix the tokens in the tokenizer config, tokenizer file, vocab file, and special tokens map file."""

    print("STEP 1: Fixing tokenizer config file...")
    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    tokenizer_config["bos_token"] = get_bos_token(DESIRED_MAPPING).content
    tokenizer_config["eos_token"] = get_eos_token(DESIRED_MAPPING).content
    tokenizer_config["pad_token"] = get_pad_token(DESIRED_MAPPING).content
    tokenizer_config["unk_token"] = get_unk_token(DESIRED_MAPPING).content
    tokenizer_config["model_max_length"] = MODEL_MAX_LENGTH

    added_tokens_decoder = {}
    for token in DESIRED_MAPPING:
        added_tokens_decoder.update(token.to_added_tokens_decoder())
    tokenizer_config["added_tokens_decoder"] = added_tokens_decoder

    with open(TOKENIZER_CONFIG_FILE, "w") as f:
        json.dump(tokenizer_config, f, indent=2)
    print(f"Updated tokenizer config file in {TOKENIZER_CONFIG_FILE}.")


    print("STEP 2: Fixing tokenizer file...")
    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)
    added_tokens = []
    for token in DESIRED_MAPPING:
        added_tokens.append(token.to_added_tokens())
    tokenizer["added_tokens"] = added_tokens

    for token in DESIRED_MAPPING:
        # Remove any existing vocab entries that already occupy this token id,
        # so each id maps to exactly one piece of content after the update.
        for key in list(tokenizer["model"]["vocab"].keys()):
            if tokenizer["model"]["vocab"][key] == token.id:
                tokenizer["model"]["vocab"].pop(key)

        # The id is now free, so map the desired content to it.
        tokenizer["model"]["vocab"][token.content] = token.id

    with open(TOKENIZER_FILE, "w") as f:
        json.dump(tokenizer, f, indent=2)

    print(f"Updated tokenizer file in {TOKENIZER_FILE}.")

    print("STEP 3: Fixing vocab file...")
    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)
    for token in DESIRED_MAPPING:
        # Remove any existing vocab entries that already occupy this token id.
        for key in list(vocab.keys()):
            if vocab[key] == token.id:
                vocab.pop(key)

        # The id is now free, so map the desired content to it.
        vocab[token.content] = token.id
    with open(VOCAB_FILE, "w") as f:
        json.dump(vocab, f, indent=2)
    print(f"Updated vocab file in {VOCAB_FILE}.")

    print("STEP 4: Fixing special tokens map file...")
    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            special_tokens_map[key] = value
            print(f"Updated special token map {key} content")

    with open(SPECIAL_TOKENS_MAP_FILE, "w") as f:
        json.dump(special_tokens_map, f, indent=2)

    print(f"Updated special tokens map file in {SPECIAL_TOKENS_MAP_FILE}.")


if __name__ == "__main__":
    cli()