Spaces:
Sleeping
Sleeping
"""
## characters
- alphanumeric characters
- numeric characters
- special characters: A special character is a character that is not an alphabetic or numeric character.
- ASCII control characters
- punctuation marks
- accent marks
- mathematical symbols
- whitespace:
- https://en.wikipedia.org/wiki/Whitespace_character
- https://emptycharacter.com/
https://www.computerhope.com/jargon/s/specchar.htm
"""
# Sample inputs for comparing tokenizers, keyed by language code.
#
# Each row is a list whose first element is the text to tokenize, prefixed
# with a "<category>:" label. Most rows carry two further strings, presumably
# the names of the two tokenizers to compare (confirm against the caller).
# NOTE(review): the "special" row holds only the text; confirm consumers
# accept a one-element row.
examples = {
    "en": [
        ["number: (10086 + 98) = 100184", "llama", "bloom"],
        # Every label names the whitespace run that precedes it (two spaces,
        # eight spaces, one tab, two tabs, one newline), so the space runs
        # must really be 2 and 8 characters long for the row to test anything.
        # chatglm has blank_n tokens; bert drops the spaces.
        [
            "whitespace:  2spaces        8spaces\t1tab\t\t2tab\n1newline",
            "llama",
            "bert_base_cased",
        ],
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        # llama's vocabulary is somewhat small.
        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"],
        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        ["special: [PAD] [UNK] [CLS] [SEP] [MASK] "],
    ],
    "zh": [
        # Same convention as the English whitespace row: two spaces precede
        # the "2-spaces" label and eight precede the "8-spaces" label.
        # chatglm has blank_n tokens.
        ["空格测试:  2个空格        8个空格", "llama", "chatglm2_6b"],
        ["标点测试:,。!?;", "baichuan_7b", "llama"],
        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
    ],
}
# Extra comparison presets as (tokenizer1, tokenizer2, text) tuples; these are
# the three values get_more_example() places into each generated link.
more_examples = [
    # BERT family (bert VS clue; clue VS kplug)
    ("bert_base_cased", "bert_base_uncased", ""),
    ("bert_base_cased", "clue", ""),

    # LLaMA family (sentencepiece-based)
    # NOTE(review): the sample text advertises "multiple spaces" but only a
    # single space is visible here; confirm the intended run length.
    ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
    ("llama", "chinese_llama2", ""),
    ("chinese_llama", "chinese_llama2", ""),

    # GLM family (sentencepiece-based)
    ("glm", "chatglm1", ""),
    ("chatglm1", "chatglm2", ""),

    # GPT-2 family
    ("gpt2", "moss", ""),
    ("", "", ""),

    # OpenAI family (tiktoken)
    ("qwen", "gpt_35_turbo", ""),
]
# Language code selecting which list inside `examples` is exposed.
lang = "en"

# Category label of every example row for that language: the part of the
# row's text that comes before its first ":".
example_types = [row[0].partition(":")[0] for row in examples[lang]]
def example_fn(example_idx):
    """Return the example row at position ``example_idx`` for the active ``lang``."""
    rows_for_lang = examples[lang]
    return rows_for_lang[example_idx]
def get_more_example(pairs=None):
    """Print and return one tokenizer-arena comparison link per preset.

    Args:
        pairs: Optional iterable of ``(tokenizer1, tokenizer2, text)``
            tuples. When omitted, the module-level ``more_examples`` list is
            used, which is what a call without arguments has always done.

    Returns:
        The generated URLs as a list, in input order. Each URL is also
        printed on its own line.
    """
    import urllib.parse

    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    if pairs is None:
        pairs = more_examples

    urls = []
    for tokenizer1, tokenizer2, text in pairs:
        # Percent-encode every query value, not just the text, so a tokenizer
        # name containing "&", "#", "=" or a space cannot corrupt the query
        # string. quote() with safe="/" keeps the same escaping rules that
        # were already applied to the text value.
        query = urllib.parse.urlencode(
            {"tokenizer1": tokenizer1, "tokenizer2": tokenizer2, "text": text},
            safe="/",
            quote_via=urllib.parse.quote,
        )
        full_url = f"{url_prefix}?{query}"
        print(full_url)
        urls.append(full_url)
    return urls
# Script entry point: when the module is executed directly, print the
# comparison links built by get_more_example().
if __name__ == "__main__":
    get_more_example()