| import sys | |
def clean_vocab(in_vocab_fname: str, out_vocab_fname: str) -> None:
    """
    Cleans a vocabulary file by filtering out invalid lines.

    A line is kept when, after stripping leading/trailing ``"\\r"``, ``"\\n"``
    and space characters, splitting it on a single space yields exactly two
    fields. Kept lines are copied to the output file unchanged. Every other
    line is reported on stdout: first its 0-based line index and stripped
    text, then each of its characters with its hexadecimal code point, so
    that invisible or unexpected characters can be spotted.

    Args:
        in_vocab_fname (str): path of the input vocabulary file.
        out_vocab_fname (str): path of the output (cleaned) vocabulary file.

    Raises:
        ValueError: if both paths refer to the same existing file. Opening
            the output for writing would truncate the input before it is read.
    """
    import os  # local import: only needed for the same-file guard below

    # Guard against in-place use: open(..., "w") would empty the input file
    # before a single line has been read from it.
    if os.path.exists(out_vocab_fname) and os.path.samefile(
        in_vocab_fname, out_vocab_fname
    ):
        raise ValueError(
            "in_vocab_fname and out_vocab_fname must refer to different files"
        )

    with open(in_vocab_fname, "r", encoding="utf-8") as infile, open(
        out_vocab_fname, "w", encoding="utf-8"
    ) as outfile:
        for i, line in enumerate(infile):
            fields = line.strip("\r\n ").split(" ")
            if len(fields) == 2:
                # Valid entry: write the original line verbatim.
                outfile.write(line)
            else:
                # Invalid entry: report it and dump every character's code
                # point to help diagnose stray whitespace/control characters.
                print(f"{i}: {line.strip()}")
                for c in line:
                    print(f"{c}:{hex(ord(c))}")
if __name__ == "__main__":
    # Usage: <script> <in_vocab_fname> <out_vocab_fname>
    # Exit with a usage message (written to stderr, non-zero status) instead
    # of an IndexError traceback when the two required paths are not supplied.
    # Any extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <in_vocab_fname> <out_vocab_fname>")
    in_vocab_fname, out_vocab_fname = sys.argv[1:3]
    clean_vocab(in_vocab_fname, out_vocab_fname)