Spaces:

akhaliq
/

MobileLLM-Pro

Running on Zero

App Files Files Community

akhaliq (HF Staff) committed 14 days ago

Commit

98da568

verified ·

1 Parent(s): 74cb54c

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -233

app.py CHANGED Viewed

@@ -1,19 +1,28 @@
-import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
-import os
-from typing import List, Dict, Any
-import time
 import spaces
 # Configuration
 MODEL_ID = "facebook/MobileLLM-Pro"
 MAX_HISTORY_LENGTH = 10
 MAX_NEW_TOKENS = 512
-DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly, and intelligent assistant. Provide clear, accurate, and thoughtful responses."
-# Login to Hugging Face (if token is provided)
 HF_TOKEN = os.getenv("HF_TOKEN")
 if HF_TOKEN:
     try:
@@ -22,289 +31,234 @@ if HF_TOKEN:
     except Exception as e:
         print(f"Warning: Could not login to Hugging Face: {e}")
 class MobileLLMChat:
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.device = None
         self.model_loaded = False
-        # Load model on initialization for shared app
-        self.load_model()
     def load_model(self, version="instruct"):
-        """Load the MobileLLM-Pro model and tokenizer - runs once on CPU/system memory"""
         try:
-            print(f"Loading MobileLLM-Pro ({version})...")
-            # Load tokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_ID,
-                trust_remote_code=True,
-                subfolder=version
             )
-            # Load model to CPU first for shared app
             self.model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 trust_remote_code=True,
                 subfolder=version,
                 torch_dtype=torch.float16,
-                low_cpu_mem_usage=True
             )
-            # Model will be moved to GPU during inference
             self.model.eval()
             self.model_loaded = True
-            print(f"Model loaded successfully in system memory")
             return True
         except Exception as e:
             print(f"Error loading model: {e}")
             return False
-    def format_chat_history(self, history: List[Dict[str, str]], system_prompt: str) -> List[Dict[str, str]]:
-        """Format chat history for the model"""
         messages = [{"role": "system", "content": system_prompt}]
         for msg in history:
-            if msg["role"] in ["user", "assistant"]:
-                messages.append(msg)
         return messages
     @spaces.GPU(duration=120)
-    def generate_response(self, user_input: str, history: List[Dict[str, str]],
-                         system_prompt: str, temperature: float = 0.7,
-                         max_new_tokens: int = MAX_NEW_TOKENS) -> str:
-        """Generate a response from the model - GPU allocated only during inference"""
         if not self.model_loaded:
             return "Model not loaded. Please try reloading the space."
         try:
-            # Move model to GPU for inference
-            self.device = torch.device("cuda")
             self.model.to(self.device)
-            # Add user message to history
             history.append({"role": "user", "content": user_input})
-            # Format messages
             messages = self.format_chat_history(history, system_prompt)
-            # Apply chat template
-            inputs = self.tokenizer.apply_chat_template(
-                messages,
-                return_tensors="pt",
-                add_generation_prompt=True
             ).to(self.device)
-            # Generate response
             with torch.no_grad():
                 outputs = self.model.generate(
-                    inputs,
                     max_new_tokens=max_new_tokens,
                     temperature=temperature,
                     do_sample=True,
                     pad_token_id=self.tokenizer.eos_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
                 )
-            # Decode response
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # Extract only the new response (remove input)
-            if response.startswith(messages[0]["content"]):
-                response = response[len(messages[0]["content"]):].strip()
-            # Remove the user input from the response
-            if user_input in response:
-                response = response.replace(user_input, "").strip()
-            # Clean up common prefixes
-            prefixes_to_remove = ["Assistant:", "assistant:", "Response:", "response:"]
-            for prefix in prefixes_to_remove:
-                if response.lower().startswith(prefix.lower()):
-                    response = response[len(prefix):].strip()
-            # Add assistant response to history
             history.append({"role": "assistant", "content": response})
-            # Move model back to CPU after inference to free GPU
-            self.model.to("cpu")
-            torch.cuda.empty_cache()
             return response
         except Exception as e:
             return f"Error generating response: {str(e)}"
-    @spaces.GPU(duration=120)
-    def generate_stream(self, user_input: str, history: List[Dict[str, str]],
-                       system_prompt: str, temperature: float = 0.7):
-        """Generate a streaming response from the model - GPU allocated only during inference"""
-        if not self.model_loaded:
-            yield "Model not loaded. Please try reloading the space."
-            return
-        try:
-            # Move model to GPU for inference
-            self.device = torch.device("cuda")
-            self.model.to(self.device)
-            # Add user message to history
-            history.append({"role": "user", "content": user_input})
-            # Format messages
-            messages = self.format_chat_history(history, system_prompt)
-            # Apply chat template
-            inputs = self.tokenizer.apply_chat_template(
-                messages,
-                return_tensors="pt",
-                add_generation_prompt=True
-            ).to(self.device)
-            # Generate streaming response
-            generated_text = ""
-            for token_id in self.model.generate(
-                inputs,
-                max_new_tokens=MAX_NEW_TOKENS,
-                temperature=temperature,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id,
-                eos_token_id=self.tokenizer.eos_token_id,
-                streamer=None,
-            ):
-                # Decode current token
-                new_token = self.tokenizer.decode(token_id[-1:], skip_special_tokens=True)
-                generated_text += new_token
-                # Extract only the new response
-                response = generated_text
-                if response.startswith(messages[0]["content"]):
-                    response = response[len(messages[0]["content"]):].strip()
-                if user_input in response:
-                    response = response.replace(user_input, "").strip()
-                # Clean up common prefixes
-                prefixes_to_remove = ["Assistant:", "assistant:", "Response:", "response:"]
-                for prefix in prefixes_to_remove:
-                    if response.lower().startswith(prefix.lower()):
-                        response = response[len(prefix):].strip()
-                yield response
-                # Stop if we hit end of sentence
-                if new_token in ["</s>", "<|endoftext|>", "."] and len(response) > 50:
-                    break
-            # Add final response to history
-            history.append({"role": "assistant", "content": response})
-            # Move model back to CPU after inference to free GPU
-            self.model.to("cpu")
-            torch.cuda.empty_cache()
-        except Exception as e:
-            yield f"Error generating response: {str(e)}"
-# Initialize chat model (loads model once on startup)
 print("Initializing MobileLLM-Pro model...")
 chat_model = MobileLLMChat()
 def clear_chat():
-    """Clear the chat history"""
-    return [], []
 def chat_fn(message, history, system_prompt, temperature):
-    """Main chat function"""
     if not chat_model.model_loaded:
         return history + [[message, "Please wait for the model to load or reload the space."]]
-    # Convert history format for the model
     formatted_history = []
     for user_msg, assistant_msg in history:
         formatted_history.append({"role": "user", "content": user_msg})
         if assistant_msg:
             formatted_history.append({"role": "assistant", "content": assistant_msg})
-    # Generate response
     response = chat_model.generate_response(message, formatted_history, system_prompt, temperature)
-    # Return updated history with new message pair
     return history + [[message, response]]
 def chat_stream_fn(message, history, system_prompt, temperature):
-    """Streaming chat function"""
     if not chat_model.model_loaded:
-        yield "Please wait for the model to load or reload the space."
         return
-    # Convert history format
     formatted_history = []
     for user_msg, assistant_msg in history:
         formatted_history.append({"role": "user", "content": user_msg})
         if assistant_msg:
             formatted_history.append({"role": "assistant", "content": assistant_msg})
-    # Generate streaming response
-    for chunk in chat_model.generate_stream(message, formatted_history, system_prompt, temperature):
-        yield chunk
-# Create the Gradio interface
 with gr.Blocks(
     title="MobileLLM-Pro Chat",
     theme=gr.themes.Soft(),
     css="""
-    .gradio-container {
-        max-width: 900px !important;
-        margin: auto !important;
-    }
-    .message {
-        padding: 12px !important;
-        border-radius: 8px !important;
-        margin-bottom: 8px !important;
-    }
-    .user-message {
-        background-color: #e3f2fd !important;
-        margin-left: 20% !important;
-    }
-    .assistant-message {
-        background-color: #f5f5f5 !important;
-        margin-right: 20% !important;
-    }
     """
 ) as demo:
     # Header
-    gr.HTML("""
-    <div style="text-align: center; margin-bottom: 20px;">
-        <h1>🤖 MobileLLM-Pro Chat</h1>
-        <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
-        <p>Chat with Facebook's MobileLLM-Pro model optimized for on-device inference</p>
-    </div>
-    """)
-    # Model status indicator
     with gr.Row():
         model_status = gr.Textbox(
             label="Model Status",
             value="Model loaded and ready!" if chat_model.model_loaded else "Model loading...",
             interactive=False,
-            container=True
         )
-    # Configuration section
     with gr.Accordion("⚙️ Configuration", open=False):
         with gr.Row():
             system_prompt = gr.Textbox(
                 value=DEFAULT_SYSTEM_PROMPT,
                 label="System Prompt",
                 lines=3,
-                info="Customize the AI's behavior and personality"
             )
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1,
@@ -312,56 +266,47 @@ with gr.Blocks(
                 value=0.7,
                 step=0.1,
                 label="Temperature",
-                info="Controls randomness (higher = more creative)"
             )
             streaming = gr.Checkbox(
                 value=True,
                 label="Enable Streaming",
-                info="Show responses as they're being generated"
             )
-    # Chat interface
     chatbot = gr.Chatbot(
         label="Chat History",
         height=500,
-        show_copy_button=True
     )
     with gr.Row():
         msg = gr.Textbox(
             label="Your Message",
             placeholder="Type your message here...",
             scale=4,
-            container=False
         )
         submit_btn = gr.Button("Send", variant="primary", scale=1)
         clear_btn = gr.Button("Clear", scale=0)
-    # Handle chat submission
-    def handle_chat(message, history, system_prompt, temperature, streaming):
-        if streaming:
-            return chat_stream_fn(message, history, system_prompt, temperature)
-        else:
-            return chat_fn(message, history, system_prompt, temperature)
     msg.submit(
         handle_chat,
         inputs=[msg, chatbot, system_prompt, temperature, streaming],
-        outputs=[chatbot]
     )
     submit_btn.click(
         handle_chat,
         inputs=[msg, chatbot, system_prompt, temperature, streaming],
-        outputs=[chatbot]
     )
     clear_btn.click(
         clear_chat,
-        outputs=[chatbot, msg]
     )
     # Examples
     gr.Examples(
         examples=[
@@ -372,21 +317,25 @@ with gr.Blocks(
             ["How can I improve my productivity?"],
         ],
         inputs=[msg],
-        label="Example Prompts"
     )
     # Footer
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 20px; color: #666;">
-        <p>⚠️ Note: Model is pre-loaded for faster inference. GPU is allocated only during generation.</p>
-        <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
-    </div>
-    """)
-# Launch the app
 if __name__ == "__main__":
     demo.launch(
-        share=True,
         show_error=True,
-        debug=True
-    )

+import os
+import time
+from typing import List, Dict
 import torch
+import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
 import spaces
+# =========================
 # Configuration
+# =========================
 MODEL_ID = "facebook/MobileLLM-Pro"
+MODEL_SUBFOLDER = "instruct"  # "base" | "instruct"
 MAX_HISTORY_LENGTH = 10
 MAX_NEW_TOKENS = 512
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a helpful, friendly, and intelligent assistant. "
+    "Provide clear, accurate, and thoughtful responses."
+)
+# =========================
+# HF Login (optional)
+# =========================
 HF_TOKEN = os.getenv("HF_TOKEN")
 if HF_TOKEN:
     try:
     except Exception as e:
         print(f"Warning: Could not login to Hugging Face: {e}")
+# =========================
+# Chat Model Wrapper
+# =========================
 class MobileLLMChat:
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.device = None
         self.model_loaded = False
+        self.load_model(version=MODEL_SUBFOLDER)
     def load_model(self, version="instruct"):
+        """Load the MobileLLM-Pro model and tokenizer (initially to CPU)."""
         try:
+            print(f"Loading {MODEL_ID} ({version})...")
             self.tokenizer = AutoTokenizer.from_pretrained(
+                MODEL_ID, trust_remote_code=True, subfolder=version
             )
             self.model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 trust_remote_code=True,
                 subfolder=version,
                 torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
             )
+            # Safety: ensure pad token exists (some LLMs don't set it)
+            if self.tokenizer.pad_token_id is None:
+                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
             self.model.eval()
             self.model_loaded = True
+            print("Model loaded successfully to system memory (CPU).")
             return True
         except Exception as e:
             print(f"Error loading model: {e}")
             return False
+    def format_chat_history(
+        self, history: List[Dict[str, str]], system_prompt: str
+    ) -> List[Dict[str, str]]:
+        """Format chat history for tokenizer's chat template."""
         messages = [{"role": "system", "content": system_prompt}]
+        # Truncate to keep the last N turns
+        trimmed = []
         for msg in history:
+            if msg["role"] in ("user", "assistant"):
+                trimmed.append(msg)
+        if MAX_HISTORY_LENGTH > 0:
+            trimmed = trimmed[-(MAX_HISTORY_LENGTH * 2) :]
+        messages.extend(trimmed)
         return messages
     @spaces.GPU(duration=120)
+    def generate_response(
+        self,
+        user_input: str,
+        history: List[Dict[str, str]],
+        system_prompt: str,
+        temperature: float = 0.7,
+        max_new_tokens: int = MAX_NEW_TOKENS,
+    ) -> str:
+        """Generate a full response (GPU during inference)."""
         if not self.model_loaded:
             return "Model not loaded. Please try reloading the space."
         try:
+            # Choose device (Spaces GPU if available)
+            use_cuda = torch.cuda.is_available()
+            self.device = torch.device("cuda" if use_cuda else "cpu")
             self.model.to(self.device)
+            # Append the new user message
             history.append({"role": "user", "content": user_input})
             messages = self.format_chat_history(history, system_prompt)
+            # Build inputs with chat template
+            input_ids = self.tokenizer.apply_chat_template(
+                messages, return_tensors="pt", add_generation_prompt=True
             ).to(self.device)
+            # No padding used here -> full ones mask
+            attention_mask = torch.ones_like(input_ids)
             with torch.no_grad():
                 outputs = self.model.generate(
+                    input_ids,
+                    attention_mask=attention_mask,
                     max_new_tokens=max_new_tokens,
                     temperature=temperature,
                     do_sample=True,
                     pad_token_id=self.tokenizer.eos_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
                 )
+            # Slice only the newly generated tokens
+            gen_ids = outputs[0][input_ids.shape[1] :]
+            response = self.tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+            # Update history (internal state for the caller if desired)
             history.append({"role": "assistant", "content": response})
+            # Free GPU VRAM
+            if use_cuda:
+                self.model.to("cpu")
+                torch.cuda.empty_cache()
             return response
         except Exception as e:
             return f"Error generating response: {str(e)}"
+# =========================
+# Initialize Chat Model
+# =========================
 print("Initializing MobileLLM-Pro model...")
 chat_model = MobileLLMChat()
+# =========================
+# Gradio Helpers
+# =========================
 def clear_chat():
+    """Clear the chat history and input box."""
+    return [], ""
 def chat_fn(message, history, system_prompt, temperature):
+    """Non-streaming chat handler (returns tuples)."""
     if not chat_model.model_loaded:
         return history + [[message, "Please wait for the model to load or reload the space."]]
+    # Convert tuples history -> list of role dicts
     formatted_history = []
     for user_msg, assistant_msg in history:
         formatted_history.append({"role": "user", "content": user_msg})
         if assistant_msg:
             formatted_history.append({"role": "assistant", "content": assistant_msg})
+    # Generate full response once
     response = chat_model.generate_response(message, formatted_history, system_prompt, temperature)
+    # Return updated tuples history
     return history + [[message, response]]
 def chat_stream_fn(message, history, system_prompt, temperature):
+    """Streaming chat handler (tuples): generate once, then chunk out."""
     if not chat_model.model_loaded:
+        yield history + [[message, "Please wait for the model to load or reload the space."]]
         return
+    # Convert tuples history -> list of role dicts
     formatted_history = []
     for user_msg, assistant_msg in history:
         formatted_history.append({"role": "user", "content": user_msg})
         if assistant_msg:
             formatted_history.append({"role": "assistant", "content": assistant_msg})
+    # Generate full response (GPU)
+    full_response = chat_model.generate_response(
+        message, formatted_history, system_prompt, temperature
+    )
+    # Start new row and progressively fill assistant side
+    base = history + [[message, ""]]
+    if not isinstance(full_response, str):
+        # In case of an error string (already str), we still stream it
+        full_response = str(full_response)
+    step = max(8, len(full_response) // 40)  # ~40 chunks
+    for i in range(0, len(full_response), step):
+        partial = full_response[: i + step]
+        yield base[:-1] + [[message, partial]]
+    # Final ensure complete
+    yield base[:-1] + [[message, full_response]]
+def handle_chat(message, history, system_prompt, temperature, streaming):
+    return (
+        chat_stream_fn(message, history, system_prompt, temperature)
+        if streaming
+        else chat_fn(message, history, system_prompt, temperature)
+    )
+# =========================
+# Gradio UI
+# =========================
 with gr.Blocks(
     title="MobileLLM-Pro Chat",
     theme=gr.themes.Soft(),
     css="""
+    .gradio-container { max-width: 900px !important; margin: auto !important; }
+    .message { padding: 12px !important; border-radius: 8px !important; margin-bottom: 8px !important; }
+    .user-message { background-color: #e3f2fd !important; margin-left: 20% !important; }
+    .assistant-message { background-color: #f5f5f5 !important; margin-right: 20% !important; }
     """
 ) as demo:
     # Header
+    gr.HTML(
+        """
+        <div style="text-align: center; margin-bottom: 20px;">
+            <h1>🤖 MobileLLM-Pro Chat</h1>
+            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
+            <p>Chat with Facebook's MobileLLM-Pro model optimized for on-device inference</p>
+        </div>
+        """
+    )
+    # Model status
     with gr.Row():
         model_status = gr.Textbox(
             label="Model Status",
             value="Model loaded and ready!" if chat_model.model_loaded else "Model loading...",
             interactive=False,
+            container=True,
         )
+    # Config
     with gr.Accordion("⚙️ Configuration", open=False):
         with gr.Row():
             system_prompt = gr.Textbox(
                 value=DEFAULT_SYSTEM_PROMPT,
                 label="System Prompt",
                 lines=3,
+                info="Customize the AI's behavior and personality",
             )
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1,
                 value=0.7,
                 step=0.1,
                 label="Temperature",
+                info="Controls randomness (higher = more creative)",
             )
             streaming = gr.Checkbox(
                 value=True,
                 label="Enable Streaming",
+                info="Show responses as they're being generated",
             )
+    # Chatbot in TUPLES mode
     chatbot = gr.Chatbot(
         label="Chat History",
         height=500,
+        show_copy_button=True,
     )
     with gr.Row():
         msg = gr.Textbox(
             label="Your Message",
             placeholder="Type your message here...",
             scale=4,
+            container=False,
         )
         submit_btn = gr.Button("Send", variant="primary", scale=1)
         clear_btn = gr.Button("Clear", scale=0)
+    # Wire events
     msg.submit(
         handle_chat,
         inputs=[msg, chatbot, system_prompt, temperature, streaming],
+        outputs=[chatbot],
     )
     submit_btn.click(
         handle_chat,
         inputs=[msg, chatbot, system_prompt, temperature, streaming],
+        outputs=[chatbot],
     )
     clear_btn.click(
         clear_chat,
+        outputs=[chatbot, msg],
     )
     # Examples
     gr.Examples(
         examples=[
             ["How can I improve my productivity?"],
         ],
         inputs=[msg],
+        label="Example Prompts",
     )
     # Footer
+    gr.HTML(
+        """
+        <div style="text-align: center; margin-top: 20px; color: #666;">
+            <p>⚠️ Note: Model is pre-loaded for faster inference. GPU is allocated only during generation.</p>
+            <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
+        </div>
+        """
+    )
+# Optional: queue to improve streaming UX
+demo.queue()
+# Launch (NO share=True on Spaces)
 if __name__ == "__main__":
     demo.launch(
         show_error=True,
+        debug=True,
+    )