Gül Sena Altıntaş
committed on
Commit
·
cb0e70e
1
Parent(s):
aebf6ac
Fixed supertoken tokenizer loading
Browse files
app.py
CHANGED
|
@@ -73,6 +73,22 @@ def parse_dataset(text):
|
|
| 73 |
error_msg = '\n'.join(errors) if errors else ""
|
| 74 |
return questions, error_msg
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
|
| 78 |
"""Load model and tokenizer with caching"""
|
|
@@ -97,7 +113,7 @@ def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None)
|
|
| 97 |
progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
|
| 98 |
|
| 99 |
# Load tokenizer
|
| 100 |
-
tokenizer =
|
| 101 |
|
| 102 |
# Add pad token if missing
|
| 103 |
if tokenizer.pad_token is None:
|
|
|
|
| 73 |
error_msg = '\n'.join(errors) if errors else ""
|
| 74 |
return questions, error_msg
|
| 75 |
|
| 76 |
+
def setup_tokenizer(model_path):
    """Resolve and load the tokenizer appropriate for ``model_path``.

    Supertoken checkpoints ship a repo-level ``tokenizer_config.json`` that
    records which underlying tokenizer the model was trained with; for those
    repos the HuggingFace tokenizer path is read from that config instead of
    loading directly from the model repo.

    Args:
        model_path: HuggingFace Hub repo id (or local path) of the model.

    Returns:
        The loaded ``AutoTokenizer`` instance.
    """
    resolved_name = model_path
    if "supertoken" in model_path:
        # Heavy hub imports are kept local so non-supertoken loads skip them.
        import json

        from huggingface_hub import hf_hub_download, list_repo_files

        repo_files = list_repo_files(model_path)
        if "tokenizer_config.json" in repo_files:
            config_file = hf_hub_download(
                repo_id=model_path, filename="tokenizer_config.json"
            )
            with open(config_file) as fh:
                tokenizer_cfg = json.load(fh)["data"]["tokenizer"]
            if tokenizer_cfg["name"] == "huggingface":
                resolved_name = tokenizer_cfg["path"]
            # todo: tiktoken
    return AutoTokenizer.from_pretrained(
        resolved_name, trust_remote_code=True, legacy=True
    )
|
| 91 |
+
|
| 92 |
|
| 93 |
def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
|
| 94 |
"""Load model and tokenizer with caching"""
|
|
|
|
| 113 |
progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
|
| 114 |
|
| 115 |
# Load tokenizer
|
| 116 |
+
tokenizer = setup_tokenizer(model_path)
|
| 117 |
|
| 118 |
# Add pad token if missing
|
| 119 |
if tokenizer.pad_token is None:
|