Spaces:

PeterPinetree
/

Next-Token-Predictor

Sleeping

App Files Files Community

PeterPinetree committed on Sep 17

Commit

bf08f52

1 Parent(s): 71d95c1

Update to correct Inference Providers chat completions API format

Browse files

Files changed (1) hide show

app.py +42 -13

app.py CHANGED Viewed

@@ -10,8 +10,8 @@ from dotenv import load_dotenv
 load_dotenv()
 # Configuration
-API_BASE = "https://router.huggingface.co/hf-inference/models/"
-MODEL_ID = "openai-community/gpt2"
 HF_TOKEN = os.getenv('HF_NEXT_TOKEN_PREDICTOR_TOKEN', '')
 def show_token(token: str) -> str:
@@ -34,20 +34,20 @@ def predict_next_token(text: str, top_k: int = 10, hide_punctuation: bool = Fals
     start_time = time.time()
     try:
-        # Call Hugging Face Serverless Inference API
-        url = f"{API_BASE}{MODEL_ID}"
         headers = {
             'Authorization': f'Bearer {HF_TOKEN}',
             'Content-Type': 'application/json',
         }
         payload = {
             'inputs': text,
             'parameters': {
                 'max_new_tokens': 1,
                 'do_sample': False,
-                'return_full_text': False,
-                'details': True,
-                'top_k': min(top_k, 50)  # API limitation
             }
         }
@@ -60,10 +60,10 @@ def predict_next_token(text: str, top_k: int = 10, hide_punctuation: bool = Fals
             print(f"Response text: {response.text}")
         if not response.ok:
-            # Try GPT-2 Medium as fallback if the main model fails
-            if MODEL_ID == "openai-community/gpt2":
-                print(f"Main model failed, trying GPT-2 Medium fallback...")
-                fallback_url = f"{API_BASE}openai-community/gpt2-medium"
                 fallback_response = requests.post(fallback_url, headers=headers, json=payload, timeout=30)
                 print(f"Fallback response status: {fallback_response.status_code}")
                 if fallback_response.ok:
@@ -86,8 +86,17 @@ def predict_next_token(text: str, top_k: int = 10, hide_punctuation: bool = Fals
         result = response.json()
         prediction_time = int((time.time() - start_time) * 1000)
-        # Parse response and create token list
-        tokens_html = create_token_display(result, top_k, hide_punctuation)
         return tokens_html, f"Prediction time: {prediction_time}ms"
@@ -98,6 +107,26 @@ def predict_next_token(text: str, top_k: int = 10, hide_punctuation: bool = Fals
     except Exception as e:
         return f"❌ Error: {str(e)}", ""
 def create_token_display(api_result: dict, top_k: int, hide_punctuation: bool) -> str:
     """Create HTML display for predicted tokens"""

 load_dotenv()
 # Configuration
+API_BASE = "https://router.huggingface.co/v1/"
+MODEL_ID = "Qwen/Qwen3-0.6B"
 HF_TOKEN = os.getenv('HF_NEXT_TOKEN_PREDICTOR_TOKEN', '')
 def show_token(token: str) -> str:
     start_time = time.time()
     try:
+        # Call Hugging Face Inference Providers API (Text Generation format)
+        url = f"{API_BASE}text-generation"
         headers = {
             'Authorization': f'Bearer {HF_TOKEN}',
             'Content-Type': 'application/json',
         }
         payload = {
+            'model': MODEL_ID,
             'inputs': text,
             'parameters': {
                 'max_new_tokens': 1,
+                'temperature': 0.0,
                 'do_sample': False,
+                'return_full_text': False
             }
         }
             print(f"Response text: {response.text}")
         if not response.ok:
+            # Try a different Qwen model as fallback if the main model fails
+            if MODEL_ID == "Qwen/Qwen3-0.6B":
+                print(f"Main model failed, trying Qwen2.5-0.5B fallback...")
+                fallback_url = f"{API_BASE}Qwen/Qwen2.5-0.5B-Instruct"
                 fallback_response = requests.post(fallback_url, headers=headers, json=payload, timeout=30)
                 print(f"Fallback response status: {fallback_response.status_code}")
                 if fallback_response.ok:
         result = response.json()
         prediction_time = int((time.time() - start_time) * 1000)
+        # Parse chat completion response - it returns a single message, not probabilities
+        try:
+            predicted_text = result['choices'][0]['message']['content'].strip()
+            # Extract just the next word (in case model returns more)
+            next_word = predicted_text.split()[0] if predicted_text else "?"
+            # Create simple display since we don't have probabilities
+            tokens_html = create_simple_token_display(next_word)
+        except (KeyError, IndexError) as e:
+            return f"❌ Error parsing response: {str(e)}", ""
         return tokens_html, f"Prediction time: {prediction_time}ms"
     except Exception as e:
         return f"❌ Error: {str(e)}", ""
+def create_simple_token_display(predicted_word: str) -> str:
+    """Create HTML display for a single predicted token (chat completions format).
+
+    Builds a styled container <div> holding exactly one row: the token text
+    (as rendered by show_token) on the left and the fixed label "Predicted"
+    on the right. No probability value is rendered.
+
+    Args:
+        predicted_word: The token text to display; it is passed through
+            show_token before being placed in the markup.
+
+    Returns:
+        The assembled HTML fragment as a string.
+    """
+    # Open the outer container; it is closed by the final "</div>" appended below.
+    html = """
+    <div style="font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; background: #0e162b; border: 1px solid #1c2945; border-radius: 14px; padding: 12px;">
+    """
+    # show_token is defined elsewhere in this file; its result is interpolated
+    # into the markup as-is.
+    # NOTE(review): confirm show_token HTML-escapes its input before relying on this.
+    token_display = show_token(predicted_word)
+    # Single two-column grid row: token text on the left, static label on the right.
+    html += f"""
+    <div style="display: grid; grid-template-columns: 1fr auto; gap: 8px; align-items: center; padding: 8px 10px; margin: 4px 0; border-radius: 10px; background: #0f1930; border: 1px solid #22365e; cursor: pointer;">
+        <div style="color: #e6f1ff; font-size: 14px;">{token_display}</div>
+        <div style="color: #9ab0d0; font-size: 12px;">Predicted</div>
+    </div>
+    """
+    # Close the outer container opened at the top.
+    html += "</div>"
+    return html
 def create_token_display(api_result: dict, top_k: int, hide_punctuation: bool) -> str:
     """Create HTML display for predicted tokens"""