import os
from typing import List, Tuple

import gradio as gr
import tiktoken
from transformers import AutoTokenizer

HF_TOKEN = os.getenv("HF_TOKEN")
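# Note: meta-llama/Llama-2-13b-hf is a gated repo on the Hub, so HF_TOKEN
# must belong to an account that has been granted access; the other models
# are public.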
# (model id, use_fast) pairs for the Hugging Face tokenizers
hf_tokenizer_list = [
    ("tugstugi/bert-large-mongolian-cased", False),
    ("tugstugi/bert-large-mongolian-uncased", False),
    ("bayartsogt/mongolian-roberta-large", True),
    ("meta-llama/Llama-2-13b-hf", True),
    ("tiiuae/falcon-7b", True),
    ("bigscience/bloom", True),
]

openai_tokenizer_list = [
    "text-davinci-003",
    "gpt-4",
]
# load tokenizers
hf_tokenizers = [
    AutoTokenizer.from_pretrained(model_name_or_id, use_fast=use_fast, trust_remote_code=True, token=HF_TOKEN)
    for model_name_or_id, use_fast in hf_tokenizer_list
]
openai_tokenizers = [
    tiktoken.encoding_for_model(name)
    for name in openai_tokenizer_list
]

def do_tokenize(tokenizer, text: str) -> List[Tuple[str, str]]:
    # Encode the text and pair each decoded token with its position index,
    # i.e. the (text, label) tuples gr.HighlightedText expects. Accepts both
    # Hugging Face tokenizers and tiktoken encodings, since both expose
    # encode()/decode().
    return [(tokenizer.decode([token_id]), str(i)) for i, token_id in enumerate(tokenizer.encode(text))]

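# Hypothetical illustration of do_tokenize output (exact token strings depend
# on each tokenizer's vocabulary); with the gpt-4 encoding one would expect
# roughly:
#   do_tokenize(openai_tokenizers[1], "hello world")
#   -> [("hello", "0"), (" world", "1")]
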
def do_simple_split(text: str) -> List[Tuple[str, str]]:
    # naive whitespace split, shown as a baseline alongside the real tokenizers
    return [(x, str(i)) for i, x in enumerate(text.split())]

def do_function(text: str):
    # The order of this tuple must match the order of the `outputs` list
    # passed to gr.Interface below.
    return (
        text,
        len(text),
        do_simple_split(text),
        *[do_tokenize(tokenizer, text) for tokenizer in hf_tokenizers],
        *[do_tokenize(tokenizer, text) for tokenizer in openai_tokenizers],
    )

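# A minimal sanity check outside Gradio (assumes the tokenizers above loaded;
# the HighlightedText pairs are tokenizer-dependent):
#   outputs = do_function("Сайн байна уу?")
#   outputs[0]  # "Сайн байна уу?"
#   outputs[1]  # 14 (character count)
#   outputs[2]  # [("Сайн", "0"), ("байна", "1"), ("уу?", "2")]
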
demo = gr.Interface(
    do_function,
    [
        # placeholder is Mongolian for "Let me know after you've deposited the money"
        gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй")
    ],
    [
        gr.Text("", label="input"),
        gr.Number(0, label="Character Count"),
        gr.HighlightedText("", label="Simple Split"),
        *[gr.HighlightedText("", label=tokenizer_name) for tokenizer_name, _ in hf_tokenizer_list],
        *[gr.HighlightedText("", label="openai/" + tokenizer_name) for tokenizer_name in openai_tokenizer_list],
    ],
    live=True,
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description=(
        "**Tokenizers:**\n" +
        "\n".join(
            [
                f"🤗 [{x}](https://huggingface.co/{x})"
                for x, _ in hf_tokenizer_list
            ] + [
                f"⏳ [{x}](https://github.com/openai/tiktoken)"
                for x in openai_tokenizer_list
            ]
        )
    ),
)

if __name__ == "__main__":
    demo.launch()