akhaliq (HF Staff) committed
Commit 195d6db · verified · 1 Parent(s): 221127d

Update app.py

Files changed (1):
  1. app.py +121 -116
app.py CHANGED
@@ -1,12 +1,21 @@
 import os
-import time
-from typing import List, Dict, Tuple, Any
+import threading
+from typing import List, Dict, Tuple, Any, Optional

 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from huggingface_hub import login
-import spaces
+
+# --- Optional: Hugging Face Spaces GPU decorator (safe locally) ---
+try:
+    import spaces  # type: ignore
+    GPU_DECORATOR = spaces.GPU
+except Exception:  # running locally without `spaces`
+    def GPU_DECORATOR(*args, **kwargs):  # no-op decorator
+        def _wrap(fn):
+            return fn
+        return _wrap

 # =========================
 # Configuration
@@ -35,20 +44,21 @@ if HF_TOKEN:
 # =========================
 # Utilities
 # =========================
-def tuples_from_messages(messages: List[Dict[str, Any]]) -> List[List[str]]:
+
+def tuples_from_messages(messages: List[Any]) -> List[List[str]]:
     """
-    Convert a Chatbot(type='messages') style history into tuples format
-    [[user, assistant], ...]. If already tuples-like, return as-is.
+    Normalize a Chatbot history to tuples [[user, assistant], ...].
+    Accepts either tuples-style or messages-style ({role, content}) lists.
     """
     if not messages:
         return []
-    # If already tuples-like (list with elements of length 2), return
+    # Already tuples-like
     if isinstance(messages[0], (list, tuple)) and len(messages[0]) == 2:
         return [list(x) for x in messages]

-    # Otherwise, convert from [{"role": "...", "content": "..."}, ...]
+    # Convert from messages-style
     pairs: List[List[str]] = []
-    last_user: str | None = None
+    last_user: Optional[str] = None
     for m in messages:
         role = m.get("role")
         content = m.get("content", "")
@@ -56,12 +66,10 @@ def tuples_from_messages(messages: List[Dict[str, Any]]) -> List[List[str]]:
             last_user = content
         elif role == "assistant":
             if last_user is None:
-                # If assistant appears first (odd state), pair with empty user
                 pairs.append(["", content])
             else:
                 pairs.append([last_user, content])
             last_user = None
-    # If there's a dangling user without assistant, pair with empty string
     if last_user is not None:
         pairs.append([last_user, ""])
     return pairs
@@ -74,7 +82,8 @@ def messages_from_tuples(history_tuples: List[List[str]]) -> List[Dict[str, str]
     """
     messages: List[Dict[str, str]] = []
     for u, a in history_tuples:
-        messages.append({"role": "user", "content": u})
+        if u:
+            messages.append({"role": "user", "content": u})
         if a:
             messages.append({"role": "assistant", "content": a})
     return messages
@@ -89,12 +98,16 @@ class MobileLLMChat:
         self.tokenizer = None
         self.device = None
         self.model_loaded = False
+        self.version = None
         self.load_model(version=MODEL_SUBFOLDER)

-    def load_model(self, version="instruct"):
-        """Load the MobileLLM-Pro model and tokenizer (initially to CPU)."""
+    def load_model(self, version: str = "instruct") -> bool:
+        """Load tokenizer+model; choose dtype/device_map safely for CPU/GPU."""
         try:
-            print(f"Loading {MODEL_ID} ({version})...")
+            print(f"Loading {MODEL_ID} ({version}) ...")
+            use_cuda = torch.cuda.is_available()
+            torch_dtype = torch.float16 if use_cuda else torch.float32
+
             self.tokenizer = AutoTokenizer.from_pretrained(
                 MODEL_ID, trust_remote_code=True, subfolder=version
             )
@@ -102,91 +115,107 @@ class MobileLLMChat:
                 MODEL_ID,
                 trust_remote_code=True,
                 subfolder=version,
-                torch_dtype=torch.float16,
+                torch_dtype=torch_dtype,
                 low_cpu_mem_usage=True,
+                device_map="auto" if use_cuda else None,
             )
-            # Safety: ensure pad token exists (some LLMs don't set it)
             if self.tokenizer.pad_token_id is None:
                 self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

             self.model.eval()
+            self.version = version
+            self.device = next(self.model.parameters()).device
             self.model_loaded = True
-            print("Model loaded successfully to system memory (CPU).")
+            print("Model loaded successfully.")
             return True
         except Exception as e:
             print(f"Error loading model: {e}")
+            self.model_loaded = False
             return False

     def format_chat_history(
-        self, history: List[Dict[str, str]], system_prompt: str
+        self, history_msgs: List[Dict[str, str]], system_prompt: str
     ) -> List[Dict[str, str]]:
-        """Format chat history for tokenizer's chat template."""
         messages = [{"role": "system", "content": system_prompt}]
-        # Truncate to keep the last N turns
-        trimmed = []
-        for msg in history:
-            if msg["role"] in ("user", "assistant"):
-                trimmed.append(msg)
+        trimmed = [m for m in history_msgs if m.get("role") in ("user", "assistant")]
         if MAX_HISTORY_LENGTH > 0:
             trimmed = trimmed[-(MAX_HISTORY_LENGTH * 2) :]
         messages.extend(trimmed)
         return messages

-    @spaces.GPU(duration=120)
-    def generate_response(
+    @GPU_DECORATOR(duration=120)
+    def generate_once(
         self,
         user_input: str,
-        history: List[Dict[str, str]],
+        history_msgs: List[Dict[str, str]],
         system_prompt: str,
         temperature: float = 0.7,
         max_new_tokens: int = MAX_NEW_TOKENS,
+        top_p: float = 0.95,
     ) -> str:
-        """Generate a full response (GPU during inference)."""
+        """Single-shot generation (no streaming)."""
         if not self.model_loaded:
-            return "Model not loaded. Please try reloading the space."
+            return "Model not loaded. Please reload."
         try:
-            # Choose device (Spaces GPU if available)
-            use_cuda = torch.cuda.is_available()
-            self.device = torch.device("cuda" if use_cuda else "cpu")
-            self.model.to(self.device)
-
-            # Append the new user message
-            history.append({"role": "user", "content": user_input})
-            messages = self.format_chat_history(history, system_prompt)
-
-            # Build inputs with chat template
-            input_ids = self.tokenizer.apply_chat_template(
+            messages = self.format_chat_history(history_msgs + [{"role": "user", "content": user_input}], system_prompt)
+            inputs = self.tokenizer.apply_chat_template(
                 messages, return_tensors="pt", add_generation_prompt=True
-            ).to(self.device)
-            # No padding used here -> full ones mask
-            attention_mask = torch.ones_like(input_ids)
+            )
+            input_ids = inputs if isinstance(inputs, torch.Tensor) else inputs["input_ids"]
+            input_ids = input_ids.to(self.device)

             with torch.no_grad():
                 outputs = self.model.generate(
                     input_ids,
-                    attention_mask=attention_mask,
                     max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.eos_token_id,
+                    temperature=float(temperature),
+                    do_sample=temperature > 0,
+                    top_p=float(top_p),
+                    pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
                 )
-
-            # Slice only the newly generated tokens
             gen_ids = outputs[0][input_ids.shape[1] :]
-            response = self.tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+            return self.tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+        except Exception as e:
+            return f"Error generating response: {e}"

-            # Update history (internal state for the caller if desired)
-            history.append({"role": "assistant", "content": response})
+    @GPU_DECORATOR(duration=120)
+    def stream_generate(
+        self,
+        user_input: str,
+        history_msgs: List[Dict[str, str]],
+        system_prompt: str,
+        temperature: float = 0.7,
+        max_new_tokens: int = MAX_NEW_TOKENS,
+        top_p: float = 0.95,
+    ):
+        """Streaming generator using TextIteratorStreamer."""
+        messages = self.format_chat_history(history_msgs + [{"role": "user", "content": user_input}], system_prompt)
+        inputs = self.tokenizer.apply_chat_template(
+            messages, return_tensors="pt", add_generation_prompt=True
+        )
+        input_ids = inputs if isinstance(inputs, torch.Tensor) else inputs["input_ids"]
+        input_ids = input_ids.to(self.device)
+
+        streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=True)
+        gen_kwargs = dict(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            temperature=float(temperature),
+            do_sample=temperature > 0,
+            top_p=float(top_p),
+            pad_token_id=self.tokenizer.pad_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            streamer=streamer,
+        )

-            # Free GPU VRAM
-            if use_cuda:
-                self.model.to("cpu")
-                torch.cuda.empty_cache()
+        thread = threading.Thread(target=self.model.generate, kwargs=gen_kwargs)
+        thread.start()

-            return response
-        except Exception as e:
-            return f"Error generating response: {str(e)}"
+        partial = ""
+        for text in streamer:
+            partial += text
+            yield partial


 # =========================
@@ -199,65 +228,44 @@ chat_model = MobileLLMChat()
 # =========================
 # Gradio Helpers
 # =========================
+
 def clear_chat():
-    """Clear the chat history and input box."""
     return [], ""


-def chat_fn(message, history, system_prompt, temperature):
+def chat_fn(message, history, system_prompt, temperature, top_p):
     """Non-streaming chat handler (returns tuples)."""
-    # DEFENSIVE: ensure tuples format
     history = tuples_from_messages(history)
-
     if not chat_model.model_loaded:
         return history + [[message, "Please wait for the model to load or reload the space."]]

-    # Convert tuples -> role dicts for the model
     formatted_history = messages_from_tuples(history)
-
-    # Generate full response once
-    response = chat_model.generate_response(message, formatted_history, system_prompt, temperature)
-
-    # Return updated tuples history
+    response = chat_model.generate_once(message, formatted_history, system_prompt, temperature, MAX_NEW_TOKENS, top_p)
     return history + [[message, response]]


-def chat_stream_fn(message, history, system_prompt, temperature):
-    """Streaming chat handler (tuples): generate once, then chunk out."""
-    # DEFENSIVE: ensure tuples format
+def chat_stream_fn(message, history, system_prompt, temperature, top_p):
+    """Streaming chat handler: yields updated tuples as tokens arrive."""
     history = tuples_from_messages(history)
-
     if not chat_model.model_loaded:
         yield history + [[message, "Please wait for the model to load or reload the space."]]
         return

-    # Convert tuples -> role dicts for the model
     formatted_history = messages_from_tuples(history)

-    # Generate full response (GPU)
-    full_response = chat_model.generate_response(
-        message, formatted_history, system_prompt, temperature
-    )
-
-    # Start new row and progressively fill assistant side
+    # Start a new row for the assistant and fill progressively
     base = history + [[message, ""]]
-    if not isinstance(full_response, str):
-        full_response = str(full_response)
-
-    step = max(8, len(full_response) // 40) # ~40 chunks
-    for i in range(0, len(full_response), step):
-        partial = full_response[: i + step]
-        yield base[:-1] + [[message, partial]]
+    for chunk in chat_model.stream_generate(message, formatted_history, system_prompt, temperature, MAX_NEW_TOKENS, top_p):
+        yield base[:-1] + [[message, chunk]]
+    # Ensure completion (in case streamer ended exactly on boundary)
+    # No extra yield needed; last chunk already yielded.

-    # Final ensure complete
-    yield base[:-1] + [[message, full_response]]

-
-def handle_chat(message, history, system_prompt, temperature, streaming):
+def handle_chat(message, history, system_prompt, temperature, top_p, streaming):
     return (
-        chat_stream_fn(message, history, system_prompt, temperature)
+        chat_stream_fn(message, history, system_prompt, temperature, top_p)
         if streaming
-        else chat_fn(message, history, system_prompt, temperature)
+        else chat_fn(message, history, system_prompt, temperature, top_p)
     )


@@ -275,18 +283,16 @@ with gr.Blocks(
     """
 ) as demo:

-    # Header
     gr.HTML(
         """
-        <div style="text-align: center; margin-bottom: 20px;">
+        <div style=\"text-align: center; margin-bottom: 20px;\">
            <h1>🤖 MobileLLM-Pro Chat</h1>
-            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
+            <p>Built with <a href=\"https://huggingface.co/spaces/akhaliq/anycoder\" target=\"_blank\">anycoder</a></p>
            <p>Chat with Facebook's MobileLLM-Pro model optimized for on-device inference</p>
        </div>
        """
    )

-    # Model status
    with gr.Row():
        model_status = gr.Textbox(
            label="Model Status",
@@ -295,7 +301,6 @@
            container=True,
        )

-    # Config
    with gr.Accordion("⚙️ Configuration", open=False):
        with gr.Row():
            system_prompt = gr.Textbox(
@@ -306,20 +311,27 @@
            )
        with gr.Row():
            temperature = gr.Slider(
-                minimum=0.1,
+                minimum=0.0,
                maximum=2.0,
                value=0.7,
-                step=0.1,
+                step=0.05,
                label="Temperature",
                info="Controls randomness (higher = more creative)",
            )
+            top_p = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.95,
+                step=0.01,
+                label="Top-p",
+                info="Nucleus sampling threshold",
+            )
            streaming = gr.Checkbox(
                value=True,
                label="Enable Streaming",
                info="Show responses as they're being generated",
            )

-    # Chatbot in TUPLES mode (explicit)
    chatbot = gr.Chatbot(
        type="tuples",
        label="Chat History",
@@ -337,16 +349,15 @@
        submit_btn = gr.Button("Send", variant="primary", scale=1)
        clear_btn = gr.Button("Clear", scale=0)

-    # Wire events (also clear the input box after send)
    msg.submit(
        handle_chat,
-        inputs=[msg, chatbot, system_prompt, temperature, streaming],
+        inputs=[msg, chatbot, system_prompt, temperature, top_p, streaming],
        outputs=[chatbot],
    ).then(lambda: "", None, msg)

    submit_btn.click(
        handle_chat,
-        inputs=[msg, chatbot, system_prompt, temperature, streaming],
+        inputs=[msg, chatbot, system_prompt, temperature, top_p, streaming],
        outputs=[chatbot],
    ).then(lambda: "", None, msg)

@@ -355,7 +366,6 @@
        outputs=[chatbot, msg],
    )

-    # Examples
    gr.Examples(
        examples=[
            ["What are the benefits of on-device AI models?"],
@@ -368,22 +378,17 @@
        label="Example Prompts",
    )

-    # Footer
    gr.HTML(
        """
-        <div style="text-align: center; margin-top: 20px; color: #666;">
+        <div style=\"text-align: center; margin-top: 20px; color: #666;\">
            <p>⚠️ Note: Model is pre-loaded for faster inference. GPU is allocated only during generation.</p>
-            <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
+            <p>Model: <a href=\"https://huggingface.co/facebook/MobileLLM-Pro\" target=\"_blank\">facebook/MobileLLM-Pro</a></p>
        </div>
        """
    )

-# Optional: queue to improve streaming UX
+# Improve streaming UX
 demo.queue()

-# Launch (NO share=True on Spaces)
 if __name__ == "__main__":
-    demo.launch(
-        show_error=True,
-        debug=True,
-    )
+    demo.launch(show_error=True, debug=True)
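
The main functional change in this commit is that streaming now comes from transformers' TextIteratorStreamer, with model.generate running on a background thread, instead of generating the full reply and re-chunking it. A minimal sketch of driving the new stream_generate path outside Gradio (illustrative only, not part of the commit; it assumes this Space's app.py is importable as `app`, that the model loaded successfully, and that the no-op GPU_DECORATOR fallback is in effect when running locally):

    # Illustrative sketch: consume MobileLLMChat.stream_generate from a plain script.
    # `from app import chat_model` is a hypothetical import of this Space's app.py;
    # importing it builds the Gradio Blocks and loads the model once (but does not launch).
    from app import chat_model

    history_msgs = []  # messages-style history: [{"role": "user"/"assistant", "content": ...}]
    for partial in chat_model.stream_generate(
        "Give one benefit of on-device LLMs.",   # user_input
        history_msgs,
        "You are a helpful assistant.",          # system_prompt
        temperature=0.7,
        max_new_tokens=64,
        top_p=0.95,
    ):
        # Each yield is the accumulated text so far, which is what
        # chat_stream_fn forwards to the Chatbot row.
        print(partial)

Each yielded string replaces the assistant cell of the last chat row, mirroring what chat_stream_fn does with base[:-1] + [[message, chunk]].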