add zephyr

- vocab/__init__.py +3 -2
- vocab/zephyr_7b_beta/__init__.py +5 -0
vocab/__init__.py CHANGED

@@ -70,7 +70,7 @@ uniq_tokenizers = [
     ""
 ]
 
-# TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
+# TODO: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
 all_tokenizers = [
     ##### bert 系列
     ("bert_base_cased", "", "bert"),
@@ -99,7 +99,7 @@ all_tokenizers = [
     ("chatyuan_large_v2", "", "sentencepiece"),
     ("prompt_clue", "", "sentencepiece"),
 
-    ("llama", "", "sentencepiece"), # '中文单字': 700, '中文多字': 0
+    ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"), # '中文单字': 700, '中文多字': 0
     ("llama2", "", "sentencepiece"),
     ("chinese_llama", "", "sentencepiece"), #
     ("chinese_llama2", "", "sentencepiece"), #
@@ -168,6 +168,7 @@ all_tokenizers = [
     ("gemma_7b",),
     ("olmo_7b",),
     ("aya_101",),
+    ("zephyr_7b_beta",)
 ]
 
 all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
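The comment added to the llama entry refers to digit splitting: the Llama SentencePiece vocab has no multi-digit tokens, so "1000" is encoded digit by digit. A minimal sketch to observe this, assuming a Llama-family checkpoint is available locally (the official meta-llama repos are gated; any checkpoint shipping the original Llama SentencePiece tokenizer should behave the same):

from transformers import AutoTokenizer

# Assumption: swap in whatever Llama-family tokenizer is available to you.
llama_tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

print(llama_tok.tokenize("1000"))
# Expected: each digit comes back as its own token (roughly ['▁', '1', '0', '0', '0']),
# i.e. 4 digit tokens for the number 1000, which is what the comment describes.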
vocab/zephyr_7b_beta/__init__.py ADDED

@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")