Update app.py
app.py CHANGED
@@ -3,9 +3,13 @@ import streamlit as st
 import itertools
 from word_piece_tokenizer import WordPieceTokenizer
 import tiktoken
+from transformers import AutoTokenizer
 
 from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
 
+qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
+ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
+
 ### User Interface ###
 st.title("Tokenization")
 
@@ -22,9 +26,8 @@ tokenizer = st.selectbox(
     "Tokenizer",
     (
         "White Space",
-        "
-        "
-        "WordPiece (BERT)",
+        "Qwen2.5 Tokenizer",
+        "RuAdapt Tokenizer",
         "Byte Pair Encoding (Open AI GPT-4o)",
     ),
     index=None,
@@ -96,31 +99,21 @@ if tokenizer == "White Space":
         unique_tokens = unique_list(split_tokens)
         st.write(stream_token_ids)
 
-elif tokenizer == "
-    with st.expander("About
-        st.write(
-
-
-    if token_id == True:
-        color = itertools.cycle(colors)
-        unique_tokens = unique_list(split_tokens)
-        st.write(stream_token_ids)
-
-elif tokenizer == "Tweet Tokenizer (NLTK)":
-    with st.expander("About Tweet Tokenizer"):
-        st.write(tweet_desc)
-    split_tokens = TweetTokenizer().tokenize(txt)
+elif tokenizer == "Qwen2.5 Tokenizer":
+    with st.expander("About Qwen2.5 Tokenizer"):
+        st.write('')
+    ids = qwen_tokenizer.encode(txt)
+    split_tokens = qwen_tokenizer.tokenize(txt)
     st.write(stream_data)
     if token_id == True:
         color = itertools.cycle(colors)
-
-        st.write(stream_token_ids)
+        st.write(stream_wp_token_ids)
 
-elif tokenizer == "
-    with st.expander("About
-        st.write(
-    ids =
-    split_tokens =
+elif tokenizer == "RuAdapt Tokenizer":
+    with st.expander("About RuAdapt Tokenizer"):
+        st.write('')
+    ids = ruadapt_tokenizer.encode(txt)
+    split_tokens = ruadapt_tokenizer.tokenize(txt)
     st.write(stream_data)
     if token_id == True:
         color = itertools.cycle(colors)