PeterPinetree committed
Commit a097df3 · 1 Parent(s): bf08f52

Refactor to use local model inference for next token prediction and enhance token display functionality

Files changed (2):
  1. app.py +182 -149
  2. requirements.txt +5 -1
app.py CHANGED
--- a/app.py
@@ -1,19 +1,24 @@
 import gradio as gr
-import requests
 import json
 import os
 import time
 from typing import List, Dict, Tuple
 from dotenv import load_dotenv
 
 # Load environment variables from .env file
 load_dotenv()
 
 # Configuration
-API_BASE = "https://router.huggingface.co/v1/"
 MODEL_ID = "Qwen/Qwen3-0.6B"
 HF_TOKEN = os.getenv('HF_NEXT_TOKEN_PREDICTOR_TOKEN', '')
 
 def show_token(token: str) -> str:
     """Format token for display"""
     if token == "\n":
@@ -22,152 +27,96 @@ def show_token(token: str) -> str:
         return f"␣{'' if len(token) == 1 else '×' + str(len(token))}"
     return token
 
-def predict_next_token(text: str, top_k: int = 10, hide_punctuation: bool = False) -> Tuple[str, str]:
-    """Predict next tokens using HF Serverless API"""
-
-    if not HF_TOKEN:
-        return "❌ No HF_NEXT_TOKEN_PREDICTOR_TOKEN found in environment variables", ""
 
     if not text.strip():
-        return "Please enter some text to predict from", ""
 
     start_time = time.time()
 
     try:
-        # Call Hugging Face Inference Providers API (Text Generation format)
-        url = f"{API_BASE}text-generation"
-        headers = {
-            'Authorization': f'Bearer {HF_TOKEN}',
-            'Content-Type': 'application/json',
-        }
-        payload = {
-            'model': MODEL_ID,
-            'inputs': text,
-            'parameters': {
-                'max_new_tokens': 1,
-                'temperature': 0.0,
-                'do_sample': False,
-                'return_full_text': False
-            }
-        }
 
-        response = requests.post(url, headers=headers, json=payload, timeout=30)
 
-        # Debug logging
-        print(f"API URL: {url}")
-        print(f"Response status: {response.status_code}")
-        if not response.ok:
-            print(f"Response text: {response.text}")
 
-        if not response.ok:
-            # Try a different Qwen model as fallback if the main model fails
-            if MODEL_ID == "Qwen/Qwen3-0.6B":
-                print(f"Main model failed, trying Qwen2.5-0.5B fallback...")
-                fallback_url = f"{API_BASE}Qwen/Qwen2.5-0.5B-Instruct"
-                fallback_response = requests.post(fallback_url, headers=headers, json=payload, timeout=30)
-                print(f"Fallback response status: {fallback_response.status_code}")
-                if fallback_response.ok:
-                    response = fallback_response
-                    print("✅ Fallback successful!")
-                else:
-                    print(f"Fallback also failed: {fallback_response.text[:100]}")
-
-            # If still not ok after fallback attempt
-            if not response.ok:
-                error_msg = f"API Error: {response.status_code} for model {MODEL_ID}"
-                try:
-                    error_detail = response.json()
-                    if 'error' in error_detail:
-                        error_msg += f" - {error_detail['error']}"
-                except:
-                    error_msg += f" - {response.text[:200]}"
-                return error_msg, ""
 
-        result = response.json()
-        prediction_time = int((time.time() - start_time) * 1000)
 
-        # Parse chat completion response - it returns a single message, not probabilities
-        try:
-            predicted_text = result['choices'][0]['message']['content'].strip()
-            # Extract just the next word (in case model returns more)
-            next_word = predicted_text.split()[0] if predicted_text else "?"
-
-            # Create simple display since we don't have probabilities
-            tokens_html = create_simple_token_display(next_word)
-
-        except (KeyError, IndexError) as e:
-            return f"❌ Error parsing response: {str(e)}", ""
 
-        return tokens_html, f"Prediction time: {prediction_time}ms"
 
-    except requests.exceptions.Timeout:
-        return "❌ API request timed out. The model might be loading - try again in a moment.", ""
-    except requests.exceptions.RequestException as e:
-        return f"❌ Network error: {str(e)}", ""
     except Exception as e:
-        return f"❌ Error: {str(e)}", ""
 
-def create_simple_token_display(predicted_word: str) -> str:
-    """Create HTML display for a single predicted token (chat completions format)"""
-
-    # Create HTML for single token
-    html = """
-    <div style="font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; background: #0e162b; border: 1px solid #1c2945; border-radius: 14px; padding: 12px;">
-    """
-
-    token_display = show_token(predicted_word)
-
-    html += f"""
-    <div style="display: grid; grid-template-columns: 1fr auto; gap: 8px; align-items: center; padding: 8px 10px; margin: 4px 0; border-radius: 10px; background: #0f1930; border: 1px solid #22365e; cursor: pointer;">
-        <div style="color: #e6f1ff; font-size: 14px;">{token_display}</div>
-        <div style="color: #9ab0d0; font-size: 12px;">Predicted</div>
-    </div>
-    """
-
-    html += "</div>"
-    return html
-
-def create_token_display(api_result: dict, top_k: int, hide_punctuation: bool) -> str:
-    """Create HTML display for predicted tokens"""
-
-    # For demo purposes, create some example predictions
-    # In a real implementation, you'd parse the API response properly
-    demo_tokens = [
-        {"token": "star", "prob": 0.35},
-        {"token": "light", "prob": 0.25},
-        {"token": "night", "prob": 0.15},
-        {"token": "sky", "prob": 0.10},
-        {"token": "bright", "prob": 0.08},
-        {"token": "moon", "prob": 0.04},
-        {"token": "sun", "prob": 0.03}
-    ]
-
-    # Filter punctuation if requested
-    if hide_punctuation:
-        import re
-        PUNC_ONLY = re.compile(r'^[\s.,;:!?—-]+$')
-        demo_tokens = [t for t in demo_tokens if not PUNC_ONLY.match(t['token'])]
-
-    # Take only top_k
-    tokens = demo_tokens[:top_k]
 
-    # Create HTML
     html = """
-    <div style="font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; background: #0e162b; border: 1px solid #1c2945; border-radius: 14px; padding: 12px;">
     """
 
-    for token in tokens:
-        token_display = show_token(token['token'])
-        percentage = f"{token['prob'] * 100:.2f}%"
 
         html += f"""
-        <div style="display: grid; grid-template-columns: 1fr auto; gap: 8px; align-items: center; padding: 8px 10px; margin: 4px 0; border-radius: 10px; background: #0f1930; border: 1px solid #22365e; cursor: pointer;">
             <div style="color: #e6f1ff; font-size: 14px;">{token_display}</div>
             <div style="color: #9ab0d0; font-size: 12px;">{percentage}</div>
         </div>
         """
 
-    html += "</div>"
     return html
 
 # Custom CSS to match the original design
@@ -187,6 +136,37 @@ custom_css = """
     background: #0e1629 !important;
     border: 1px solid #1c2945 !important;
 }
 """
 
 # Create Gradio interface
@@ -194,17 +174,10 @@ with gr.Blocks(css=custom_css, title="Next-Token Predictor") as app:
     gr.HTML("""
     <div style="text-align: center; padding: 20px; background: #0e1629; border-bottom: 1px solid #1c2945;">
         <h1 style="color: #e6f1ff; margin: 0; font-size: 24px;">🤗 Next-Token Predictor</h1>
-        <p style="color: #9ab0d0; margin: 10px 0 0 0;">Explore how AI predicts the next word! Predictions update automatically as you type.</p>
     </div>
     """)
 
-    if not HF_TOKEN:
-        gr.HTML("""
-        <div style="background: #ffb4c0; color: #000; padding: 10px; border-radius: 8px; margin: 10px;">
-            ⚠️ <strong>HF_NEXT_TOKEN_PREDICTOR_TOKEN not found!</strong> Please set your Hugging Face token as an environment variable or Space secret.
-        </div>
-        """)
-
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
@@ -218,41 +191,101 @@ with gr.Blocks(css=custom_css, title="Next-Token Predictor") as app:
             with gr.Row():
                 top_k = gr.Slider(
                     minimum=5,
-                    maximum=30,
                     value=10,
                     step=1,
-                    label="Top-K predictions",
-                    info="How many predictions to show"
                 )
-                hide_punct = gr.Checkbox(
-                    label="Hide punctuation-only tokens",
-                    value=False,
-                    info="Focus on meaningful words"
                 )
 
             timing_info = gr.HTML(value="<div style='color: #9ab0d0; font-size: 12px;'>✨ Predictions update as you type!</div>")
 
         with gr.Column(scale=1):
-            predictions_html = gr.HTML(label="🔮 Next Token Predictions")
 
-    # Event handlers - auto-prediction on any change
-    def update_predictions(text, k, hide_p):
-        result_html, timing = predict_next_token(text, int(k), hide_p)
-        return result_html, timing
 
     # Auto-predict on any input change
-    for component in [text_input, top_k, hide_punct]:
         component.change(
-            update_predictions,
-            inputs=[text_input, top_k, hide_punct],
-            outputs=[predictions_html, timing_info]
         )
 
     # Load initial predictions on app start
     app.load(
-        lambda: update_predictions("Twinkle, twinkle, little ", 10, False),
-        outputs=[predictions_html, timing_info]
     )
 
 if __name__ == "__main__":
-    app.launch(share=False, server_port=7860)
+++ b/app.py
@@ -1,19 +1,24 @@
 import gradio as gr
 import json
 import os
 import time
+import torch
 from typing import List, Dict, Tuple
 from dotenv import load_dotenv
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Load environment variables from .env file
 load_dotenv()
 
 # Configuration
 MODEL_ID = "Qwen/Qwen3-0.6B"
 HF_TOKEN = os.getenv('HF_NEXT_TOKEN_PREDICTOR_TOKEN', '')
 
+# Initialize model and tokenizer (local inference like the working app)
+print("Loading model and tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
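+# Loads once at process start and stays in memory for all requests; this runs on
+# CPU by default (pass torch_dtype/device_map to from_pretrained to target a GPU).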
+
 def show_token(token: str) -> str:
     """Format token for display"""
     if token == "\n":
@@ -22,152 +27,96 @@ def show_token(token: str) -> str:
         return f"␣{'' if len(token) == 1 else '×' + str(len(token))}"
     return token
 
+def predict_next_token(text: str, top_k: int = 10, temperature: float = 1.0, top_p: float = 0.9) -> Tuple[List[Dict], str]:
+    """Predict next tokens using local model with temperature and top-p filtering"""
 
     if not text.strip():
+        return [], "Please enter some text to predict from"
 
     start_time = time.time()
 
     try:
+        # Use local model inference
+        tokens = tokenizer(text, return_tensors="pt", padding=False)
+        out = model.generate(
+            **tokens,
+            max_new_tokens=1,
+            output_scores=True,
+            return_dict_in_generate=True,
+            pad_token_id=tokenizer.eos_token_id,
+            do_sample=False,
+        )
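+        # out.scores is a tuple with one tensor per generated step; with
+        # max_new_tokens=1 it holds a single (1, vocab_size) logits tensor.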
 
+        # Get raw logits and apply temperature scaling
+        logits = out.scores[0]
+        scaled_logits = logits / temperature
+        scores = torch.softmax(scaled_logits, dim=-1)
 
+        # Apply top-p filtering (nucleus sampling)
+        sorted_probs, sorted_indices = torch.sort(scores, descending=True)
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
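+        # cumulative_probs keeps the (1, vocab_size) shape, so torch.where below
+        # returns (row_indices, col_indices); indexing [1] takes the column positions.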
 
+        # Find the cutoff point for top-p
+        cutoff_index = torch.where(cumulative_probs >= top_p)[1]
+        if len(cutoff_index) > 0:
+            cutoff = cutoff_index[0].item() + 1
+            top_p_indices = sorted_indices[0, :cutoff]
+            top_p_probs = sorted_probs[0, :cutoff]
+        else:
+            # Fallback if top_p is very low
+            top_p_indices = sorted_indices[0, :min(50, len(sorted_indices[0]))]
+            top_p_probs = sorted_probs[0, :min(50, len(sorted_probs[0]))]
 
+        # Apply top-k to the top-p filtered results
+        final_k = min(top_k, len(top_p_indices))
+        final_indices = top_p_indices[:final_k]
+        final_probs = top_p_probs[:final_k]
 
+        # Convert to tokens
+        token_ids = [int(idx) for idx in final_indices]
+        probs = [float(prob) for prob in final_probs]
+        tokens_text = [tokenizer.decode([tid]) for tid in token_ids]
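+        # Decoding ids one at a time yields each candidate's surface text,
+        # including any leading space encoded in the BPE token itself.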
+
+        # Create token data structure
+        tokens_data = []
+        for i in range(len(token_ids)):
+            tokens_data.append({
+                "token": tokens_text[i],
+                "prob": probs[i]
+            })
+
+        prediction_time = int((time.time() - start_time) * 1000)
 
+        return tokens_data, f"Prediction time: {prediction_time}ms"
 
     except Exception as e:
+        return [], f"❌ Error: {str(e)}"
 
+def create_clickable_token_display(tokens_data: List[Dict]) -> str:
+    """Create HTML display with clickable tokens - simplified without JavaScript"""
 
     html = """
+    <div id="token-predictions" style="font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; background: #0e162b; border: 1px solid #1c2945; border-radius: 14px; padding: 12px;">
     """
 
+    for i, token_data in enumerate(tokens_data):
+        token_display = show_token(token_data['token'])
+        percentage = f"{token_data['prob'] * 100:.2f}%"
 
         html += f"""
+        <div class="token-prediction" data-token="{token_data['token']}"
+             style="display: grid; grid-template-columns: 1fr auto; gap: 8px; align-items: center; padding: 8px 10px; margin: 4px 0; border-radius: 10px; background: #0f1930; border: 1px solid #22365e; cursor: pointer; transition: background 0.2s;"
+             onmouseover="this.style.background='#1a2b4a'"
+             onmouseout="this.style.background='#0f1930'">
             <div style="color: #e6f1ff; font-size: 14px;">{token_display}</div>
             <div style="color: #9ab0d0; font-size: 12px;">{percentage}</div>
         </div>
         """
 
+    html += """
+    </div>
+    """
+
     return html
 
 # Custom CSS to match the original design
@@ -187,6 +136,37 @@ custom_css = """
     background: #0e1629 !important;
     border: 1px solid #1c2945 !important;
 }
+
+.token-button {
+    background: #0f1930 !important;
+    border: 1px solid #22365e !important;
+    color: #e6f1ff !important;
+    border-radius: 6px !important;
+    margin: 0px !important;
+    padding: 2px 6px !important;
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace !important;
+    transition: background 0.2s !important;
+    font-size: 12px !important;
+}
+
+.token-button:hover {
+    background: #1a2b4a !important;
+}
+
+/* Remove Gradio's default spacing between buttons */
+.token-button + .token-button {
+    margin-top: 0px !important;
+}
+
+/* Remove gaps in the column containing buttons */
+div:has(> .token-button) {
+    gap: 0px !important;
+}
+
+/* Target Gradio's automatic spacing */
+.block > div > div {
+    gap: 0px !important;
+}
 """
 
 # Create Gradio interface
@@ -194,17 +174,10 @@ with gr.Blocks(css=custom_css, title="Next-Token Predictor") as app:
     gr.HTML("""
     <div style="text-align: center; padding: 20px; background: #0e1629; border-bottom: 1px solid #1c2945;">
         <h1 style="color: #e6f1ff; margin: 0; font-size: 24px;">🤗 Next-Token Predictor</h1>
+        <p style="color: #9ab0d0; margin: 10px 0 0 0;">Explore how AI predicts the next word! Click on predictions to append them.</p>
     </div>
     """)
 
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
@@ -218,41 +191,101 @@ with gr.Blocks(css=custom_css, title="Next-Token Predictor") as app:
             with gr.Row():
                 top_k = gr.Slider(
                     minimum=5,
+                    maximum=15,
                     value=10,
                     step=1,
+                    label="Top-K",
+                    info="How many top predictions to show",
+                    show_label=True,
+                    interactive=True
+                )
+                temperature = gr.Slider(
+                    minimum=0.1,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Temperature",
+                    info="Creativity: Low=predictable, High=surprising",
+                    show_label=True,
+                    interactive=True
                 )
+                top_p = gr.Slider(
+                    minimum=0.1,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.05,
+                    label="Top-P",
+                    info="Consider words making up this % of probability",
+                    show_label=True,
+                    interactive=True
                 )
 
             timing_info = gr.HTML(value="<div style='color: #9ab0d0; font-size: 12px;'>✨ Predictions update as you type!</div>")
 
         with gr.Column(scale=1):
+            # Create a column for token buttons
+            with gr.Column():
+                gr.HTML("<h4 style='color: #e6f1ff; margin: 0;'>🔮 Next Token Predictions</h4>")
+
+                # Create buttons for each possible token (we'll show/hide as needed)
+                token_buttons = []
+                for i in range(15):  # Support up to 15 tokens
+                    btn = gr.Button(
+                        value="",
+                        visible=False,
+                        elem_classes=["token-button"],
+                        size="sm"
+                    )
+                    token_buttons.append(btn)
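+                # A fixed pool of 15 hidden buttons is built up front because Gradio
+                # cannot add components after launch; updates only toggle the
+                # visibility and labels of these existing buttons.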
 
+    # Store current tokens data
+    current_tokens = gr.State([])
+
+    def update_predictions_and_buttons(text, k, temp, p):
+        tokens_data, timing = predict_next_token(text, int(k), float(temp), float(p))
+
+        # Update button states
+        button_updates = []
+        for i in range(15):
+            if i < len(tokens_data):
+                token = tokens_data[i]['token']
+                prob = tokens_data[i]['prob']
+                display_token = show_token(token)
+                button_label = f"{display_token} ({prob*100:.1f}%)"
+                button_updates.append(gr.Button(value=button_label, visible=True))
+            else:
+                button_updates.append(gr.Button(visible=False))
+
+        return [timing, tokens_data] + button_updates
+
+    def append_token_to_input(current_text, tokens_data, button_index):
+        if tokens_data and 0 <= button_index < len(tokens_data):
+            token = tokens_data[button_index]['token']
+            return current_text + token
+        return current_text
 
     # Auto-predict on any input change
+    outputs = [timing_info, current_tokens] + token_buttons
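+    # This output order must match the [timing, tokens_data] + button_updates
+    # list returned by update_predictions_and_buttons.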
+    for component in [text_input, top_k, temperature, top_p]:
         component.change(
+            update_predictions_and_buttons,
+            inputs=[text_input, top_k, temperature, top_p],
+            outputs=outputs
+        )
+
+    # Set up click handlers for each token button
+    for i, btn in enumerate(token_buttons):
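+        # idx=i freezes the current loop index via a default argument; closing over
+        # i directly would make every button append the last prediction.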
+        btn.click(
+            lambda text, tokens, idx=i: append_token_to_input(text, tokens, idx),
+            inputs=[text_input, current_tokens],
+            outputs=[text_input]
         )
 
     # Load initial predictions on app start
     app.load(
+        lambda: update_predictions_and_buttons("Twinkle, twinkle, little ", 10, 1.0, 0.9),
+        outputs=outputs
     )
 
 if __name__ == "__main__":
+    app.launch(share=False)
requirements.txt CHANGED
--- a/requirements.txt
@@ -1,3 +1,7 @@
 gradio==4.44.1
 requests==2.31.0
-python-dotenv==1.0.0
+++ b/requirements.txt
@@ -1,3 +1,7 @@
 gradio==4.44.1
 requests==2.31.0
+python-dotenv==1.0.0
+transformers
+torch
+anywidget
+traitlets