Update app.py
app.py CHANGED
@@ -12,6 +12,10 @@ qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
 ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
 aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
 claude_tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
+xlmv_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
+nllb_tokenizer = AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
+
+
 
 ### User Interface ###
 st.title("Tokenization")
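
This hunk loads two additional tokenizers from the Hugging Face Hub alongside the existing Qwen, RuAdapt, Aya, and Claude ones. As a minimal standalone sketch (not part of app.py; the sample text is invented for illustration), this is the encode-then-per-token-decode round trip the rest of the app performs with these objects:

```python
# Minimal sketch of the round trip app.py applies to user input.
# Requires `transformers`; downloads the tokenizer files on first run.
from transformers import AutoTokenizer

xlmv_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base')

txt = "Tokenization differs across languages."       # stand-in for user input
ids = xlmv_tokenizer.encode(txt)                      # token ids, incl. specials like <s>/</s>
split_tokens = [xlmv_tokenizer.decode([t]) for t in ids]  # one string per id

print(len(ids))      # token count for this input
print(split_tokens)  # the per-token strings the app colorizes
```

Both additions are SentencePiece-based tokenizers; NLLB-200's also inserts a source-language token (eng_Latn by default) into the id sequence, so expect it to appear in the per-token list alongside the end-of-sequence marker.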
@@ -34,6 +38,8 @@ tokenizer = st.selectbox(
         "Aya-Expanse Tokenizer",
         "Open AI GPT-4o Tokenizer",
         "Anthropic Claude Tokenizer",
+        "XLM-V Tokenizer",
+        "NLLB-200 Tokenizer",
     ),
     index=None,
     placeholder="Select a tokenizer",
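
The two new option strings have to match the `elif tokenizer == ...` comparisons added in the last hunk character for character, or the new branches never run. Below is a hedged sketch of the selector pattern, with a placeholder label and a shortened option tuple (the real one in app.py is longer):

```python
# Sketch of the st.selectbox pattern used in app.py; label and the
# shortened option list are placeholders, not the real ones.
import streamlit as st

tokenizer = st.selectbox(
    "Choose a tokenizer",
    (
        "XLM-V Tokenizer",
        "NLLB-200 Tokenizer",
    ),
    index=None,                        # start with no selection
    placeholder="Select a tokenizer",  # prompt shown while nothing is selected
)

if tokenizer == "XLM-V Tokenizer":     # must match the option string exactly
    st.write("XLM-V Tokenizer selected")
```

With `index=None`, `st.selectbox` returns `None` until the user picks an option, so none of the `elif` branches fire on first load.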
@@ -166,4 +172,25 @@ elif tokenizer == "Anthropic Claude Tokenizer":
         color = itertools.cycle(colors)
         st.write(stream_wp_token_ids)
 
+elif tokenizer == "XLM-V Tokenizer":
+    with st.expander("About XLM-V Tokenizer"):
+        st.write('')
+    ids = xlmv_tokenizer.encode(txt)
+    split_tokens = [xlmv_tokenizer.decode([t]) for t in ids]
+    st.write(stream_data)
+    if token_id == True:
+        color = itertools.cycle(colors)
+        st.write(stream_wp_token_ids)
+
+
+elif tokenizer == "NLLB-200 Tokenizer":
+    with st.expander("About NLLB-200 Tokenizer"):
+        st.write('')
+    ids = nllb_tokenizer.encode(txt)
+    split_tokens = [nllb_tokenizer.decode([t]) for t in ids]
+    st.write(stream_data)
+    if token_id == True:
+        color = itertools.cycle(colors)
+        st.write(stream_wp_token_ids)
+
 st.write(num_tokens(txt))
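
The new branches mirror the existing per-tokenizer branches: open an (empty, for now) expander, encode the input, decode it id by id, and hand off to the shared display helpers. `txt`, `token_id`, `colors`, `stream_data`, `stream_wp_token_ids`, and `num_tokens` are defined earlier in app.py and do not appear in this diff. Purely as an assumption about their shape, a token-streaming generator in that spirit could look like this:

```python
# Hypothetical sketch only -- the real stream_data in app.py is not shown
# in this diff. It presumably walks split_tokens and yields each token
# wrapped in Streamlit's :color[...] markup, cycling through a palette.
import itertools
import time

colors = ['blue', 'green', 'orange', 'red', 'violet']  # assumed palette
split_tokens = ['To', 'ken', 'ization']                # stand-in for the decoded tokens

def stream_data():
    color = itertools.cycle(colors)
    for token in split_tokens:
        yield f":{next(color)}[{token}] "  # colored-text markdown per token
        time.sleep(0.02)                   # small delay so the tokens visibly stream
```

The `:color[text]` syntax is Streamlit's colored-text markdown; cycling the palette gives adjacent tokens distinct colors so token boundaries stay visible.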