Gül Sena Altıntaş
committed on
Commit
·
cb0e70e
1
Parent(s):
aebf6ac
Fixed supertoken tokenizer loading
Browse files
app.py
CHANGED
|
@@ -73,6 +73,22 @@ def parse_dataset(text):
|
|
| 73 |
error_msg = '\n'.join(errors) if errors else ""
|
| 74 |
return questions, error_msg
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
|
| 78 |
"""Load model and tokenizer with caching"""
|
|
@@ -97,7 +113,7 @@ def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None)
|
|
| 97 |
progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
|
| 98 |
|
| 99 |
# Load tokenizer
|
| 100 |
-
tokenizer =
|
| 101 |
|
| 102 |
# Add pad token if missing
|
| 103 |
if tokenizer.pad_token is None:
|
|
|
|
| 73 |
error_msg = '\n'.join(errors) if errors else ""
|
| 74 |
return questions, error_msg
|
| 75 |
|
| 76 |
+
def setup_tokenizer(model_path):
    """Resolve and load the tokenizer appropriate for ``model_path``.

    Supertoken checkpoints ship a repo-level ``tokenizer_config.json`` that
    records which underlying tokenizer the model was trained with; for those
    repos the HuggingFace tokenizer path is read from that config instead of
    loading directly from the model repo.

    Args:
        model_path: HuggingFace Hub repo id (or local path) of the model.

    Returns:
        The loaded ``AutoTokenizer`` instance.
    """
    resolved_name = model_path
    if "supertoken" in model_path:
        # Heavy hub imports are kept local so non-supertoken loads skip them.
        import json

        from huggingface_hub import hf_hub_download, list_repo_files

        repo_files = list_repo_files(model_path)
        if "tokenizer_config.json" in repo_files:
            config_file = hf_hub_download(
                repo_id=model_path, filename="tokenizer_config.json"
            )
            with open(config_file) as fh:
                tokenizer_cfg = json.load(fh)["data"]["tokenizer"]
            if tokenizer_cfg["name"] == "huggingface":
                resolved_name = tokenizer_cfg["path"]
            # todo: tiktoken
    return AutoTokenizer.from_pretrained(
        resolved_name, trust_remote_code=True, legacy=True
    )
|
| 91 |
+
|
| 92 |
|
| 93 |
def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
|
| 94 |
"""Load model and tokenizer with caching"""
|
|
|
|
| 113 |
progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
|
| 114 |
|
| 115 |
# Load tokenizer
|
| 116 |
+
tokenizer = setup_tokenizer(model_path)
|
| 117 |
|
| 118 |
# Add pad token if missing
|
| 119 |
if tokenizer.pad_token is None:
|