akhaliq (HF Staff) committed
Commit 98da568 · verified · 1 Parent(s): 74cb54c

Update app.py

Files changed (1)
  1. app.py +182 -233
app.py CHANGED
@@ -1,19 +1,28 @@
-import gradio as gr
+import os
+import time
+from typing import List, Dict
+
 import torch
+import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
-import os
-from typing import List, Dict, Any
-import time
 import spaces
 
+# =========================
 # Configuration
+# =========================
 MODEL_ID = "facebook/MobileLLM-Pro"
+MODEL_SUBFOLDER = "instruct"  # "base" | "instruct"
 MAX_HISTORY_LENGTH = 10
 MAX_NEW_TOKENS = 512
-DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly, and intelligent assistant. Provide clear, accurate, and thoughtful responses."
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a helpful, friendly, and intelligent assistant. "
+    "Provide clear, accurate, and thoughtful responses."
+)
 
-# Login to Hugging Face (if token is provided)
+# =========================
+# HF Login (optional)
+# =========================
 HF_TOKEN = os.getenv("HF_TOKEN")
 if HF_TOKEN:
     try:
@@ -22,289 +31,234 @@ if HF_TOKEN:
     except Exception as e:
         print(f"Warning: Could not login to Hugging Face: {e}")
 
+
+# =========================
+# Chat Model Wrapper
+# =========================
 class MobileLLMChat:
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.device = None
         self.model_loaded = False
-        # Load model on initialization for shared app
-        self.load_model()
-
+        self.load_model(version=MODEL_SUBFOLDER)
+
     def load_model(self, version="instruct"):
-        """Load the MobileLLM-Pro model and tokenizer - runs once on CPU/system memory"""
+        """Load the MobileLLM-Pro model and tokenizer (initially to CPU)."""
         try:
-            print(f"Loading MobileLLM-Pro ({version})...")
-
-            # Load tokenizer
+            print(f"Loading {MODEL_ID} ({version})...")
             self.tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_ID,
-                trust_remote_code=True,
-                subfolder=version
+                MODEL_ID, trust_remote_code=True, subfolder=version
             )
-
-            # Load model to CPU first for shared app
             self.model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 trust_remote_code=True,
                 subfolder=version,
                 torch_dtype=torch.float16,
-                low_cpu_mem_usage=True
+                low_cpu_mem_usage=True,
             )
-
-            # Model will be moved to GPU during inference
+            # Safety: ensure pad token exists (some LLMs don't set it)
+            if self.tokenizer.pad_token_id is None:
+                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
             self.model.eval()
             self.model_loaded = True
-            print(f"Model loaded successfully in system memory")
+            print("Model loaded successfully to system memory (CPU).")
            return True
-
         except Exception as e:
             print(f"Error loading model: {e}")
             return False
-
-    def format_chat_history(self, history: List[Dict[str, str]], system_prompt: str) -> List[Dict[str, str]]:
-        """Format chat history for the model"""
+
+    def format_chat_history(
+        self, history: List[Dict[str, str]], system_prompt: str
+    ) -> List[Dict[str, str]]:
+        """Format chat history for tokenizer's chat template."""
         messages = [{"role": "system", "content": system_prompt}]
-
+        # Truncate to keep the last N turns
+        trimmed = []
         for msg in history:
-            if msg["role"] in ["user", "assistant"]:
-                messages.append(msg)
-
+            if msg["role"] in ("user", "assistant"):
+                trimmed.append(msg)
+        if MAX_HISTORY_LENGTH > 0:
+            trimmed = trimmed[-(MAX_HISTORY_LENGTH * 2) :]
+        messages.extend(trimmed)
         return messages
-
+
     @spaces.GPU(duration=120)
-    def generate_response(self, user_input: str, history: List[Dict[str, str]],
-                          system_prompt: str, temperature: float = 0.7,
-                          max_new_tokens: int = MAX_NEW_TOKENS) -> str:
-        """Generate a response from the model - GPU allocated only during inference"""
+    def generate_response(
+        self,
+        user_input: str,
+        history: List[Dict[str, str]],
+        system_prompt: str,
+        temperature: float = 0.7,
+        max_new_tokens: int = MAX_NEW_TOKENS,
+    ) -> str:
+        """Generate a full response (GPU during inference)."""
         if not self.model_loaded:
             return "Model not loaded. Please try reloading the space."
-
         try:
-            # Move model to GPU for inference
-            self.device = torch.device("cuda")
+            # Choose device (Spaces GPU if available)
+            use_cuda = torch.cuda.is_available()
+            self.device = torch.device("cuda" if use_cuda else "cpu")
             self.model.to(self.device)
-
-            # Add user message to history
+
+            # Append the new user message
             history.append({"role": "user", "content": user_input})
-
-            # Format messages
             messages = self.format_chat_history(history, system_prompt)
-
-            # Apply chat template
-            inputs = self.tokenizer.apply_chat_template(
-                messages,
-                return_tensors="pt",
-                add_generation_prompt=True
+
+            # Build inputs with chat template
+            input_ids = self.tokenizer.apply_chat_template(
+                messages, return_tensors="pt", add_generation_prompt=True
             ).to(self.device)
-
-            # Generate response
+            # No padding used here -> full ones mask
+            attention_mask = torch.ones_like(input_ids)
+
             with torch.no_grad():
                 outputs = self.model.generate(
-                    inputs,
+                    input_ids,
+                    attention_mask=attention_mask,
                     max_new_tokens=max_new_tokens,
                     temperature=temperature,
                     do_sample=True,
                     pad_token_id=self.tokenizer.eos_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
                 )
-
-            # Decode response
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Extract only the new response (remove input)
-            if response.startswith(messages[0]["content"]):
-                response = response[len(messages[0]["content"]):].strip()
-
-            # Remove the user input from the response
-            if user_input in response:
-                response = response.replace(user_input, "").strip()
-
-            # Clean up common prefixes
-            prefixes_to_remove = ["Assistant:", "assistant:", "Response:", "response:"]
-            for prefix in prefixes_to_remove:
-                if response.lower().startswith(prefix.lower()):
-                    response = response[len(prefix):].strip()
-
-            # Add assistant response to history
+
+            # Slice only the newly generated tokens
+            gen_ids = outputs[0][input_ids.shape[1] :]
+            response = self.tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+
+            # Update history (internal state for the caller if desired)
             history.append({"role": "assistant", "content": response})
-
-            # Move model back to CPU after inference to free GPU
-            self.model.to("cpu")
-            torch.cuda.empty_cache()
-
+
+            # Free GPU VRAM
+            if use_cuda:
+                self.model.to("cpu")
+                torch.cuda.empty_cache()
+
             return response
-
         except Exception as e:
             return f"Error generating response: {str(e)}"
-
-    @spaces.GPU(duration=120)
-    def generate_stream(self, user_input: str, history: List[Dict[str, str]],
-                        system_prompt: str, temperature: float = 0.7):
-        """Generate a streaming response from the model - GPU allocated only during inference"""
-        if not self.model_loaded:
-            yield "Model not loaded. Please try reloading the space."
-            return
-
-        try:
-            # Move model to GPU for inference
-            self.device = torch.device("cuda")
-            self.model.to(self.device)
-
-            # Add user message to history
-            history.append({"role": "user", "content": user_input})
-
-            # Format messages
-            messages = self.format_chat_history(history, system_prompt)
-
-            # Apply chat template
-            inputs = self.tokenizer.apply_chat_template(
-                messages,
-                return_tensors="pt",
-                add_generation_prompt=True
-            ).to(self.device)
-
-            # Generate streaming response
-            generated_text = ""
-            for token_id in self.model.generate(
-                inputs,
-                max_new_tokens=MAX_NEW_TOKENS,
-                temperature=temperature,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id,
-                eos_token_id=self.tokenizer.eos_token_id,
-                streamer=None,
-            ):
-                # Decode current token
-                new_token = self.tokenizer.decode(token_id[-1:], skip_special_tokens=True)
-                generated_text += new_token
-
-                # Extract only the new response
-                response = generated_text
-                if response.startswith(messages[0]["content"]):
-                    response = response[len(messages[0]["content"]):].strip()
-
-                if user_input in response:
-                    response = response.replace(user_input, "").strip()
-
-                # Clean up common prefixes
-                prefixes_to_remove = ["Assistant:", "assistant:", "Response:", "response:"]
-                for prefix in prefixes_to_remove:
-                    if response.lower().startswith(prefix.lower()):
-                        response = response[len(prefix):].strip()
-
-                yield response
-
-                # Stop if we hit end of sentence
-                if new_token in ["</s>", "<|endoftext|>", "."] and len(response) > 50:
-                    break
-
-            # Add final response to history
-            history.append({"role": "assistant", "content": response})
-
-            # Move model back to CPU after inference to free GPU
-            self.model.to("cpu")
-            torch.cuda.empty_cache()
-
-        except Exception as e:
-            yield f"Error generating response: {str(e)}"
 
-# Initialize chat model (loads model once on startup)
+
+# =========================
+# Initialize Chat Model
+# =========================
 print("Initializing MobileLLM-Pro model...")
 chat_model = MobileLLMChat()
 
+
+# =========================
+# Gradio Helpers
+# =========================
 def clear_chat():
-    """Clear the chat history"""
-    return [], []
+    """Clear the chat history and input box."""
+    return [], ""
+
 
 def chat_fn(message, history, system_prompt, temperature):
-    """Main chat function"""
+    """Non-streaming chat handler (returns tuples)."""
     if not chat_model.model_loaded:
         return history + [[message, "Please wait for the model to load or reload the space."]]
-
-    # Convert history format for the model
+
+    # Convert tuples history -> list of role dicts
     formatted_history = []
     for user_msg, assistant_msg in history:
         formatted_history.append({"role": "user", "content": user_msg})
         if assistant_msg:
             formatted_history.append({"role": "assistant", "content": assistant_msg})
-
-    # Generate response
+
+    # Generate full response once
     response = chat_model.generate_response(message, formatted_history, system_prompt, temperature)
-
-    # Return updated history with new message pair
+
+    # Return updated tuples history
     return history + [[message, response]]
 
+
 def chat_stream_fn(message, history, system_prompt, temperature):
-    """Streaming chat function"""
+    """Streaming chat handler (tuples): generate once, then chunk out."""
     if not chat_model.model_loaded:
-        yield "Please wait for the model to load or reload the space."
+        yield history + [[message, "Please wait for the model to load or reload the space."]]
         return
-
-    # Convert history format
+
+    # Convert tuples history -> list of role dicts
     formatted_history = []
     for user_msg, assistant_msg in history:
         formatted_history.append({"role": "user", "content": user_msg})
         if assistant_msg:
             formatted_history.append({"role": "assistant", "content": assistant_msg})
-
-    # Generate streaming response
-    for chunk in chat_model.generate_stream(message, formatted_history, system_prompt, temperature):
-        yield chunk
 
-# Create the Gradio interface
+    # Generate full response (GPU)
+    full_response = chat_model.generate_response(
+        message, formatted_history, system_prompt, temperature
+    )
+
+    # Start new row and progressively fill assistant side
+    base = history + [[message, ""]]
+    if not isinstance(full_response, str):
+        # In case of an error string (already str), we still stream it
+        full_response = str(full_response)
+
+    step = max(8, len(full_response) // 40)  # ~40 chunks
+    for i in range(0, len(full_response), step):
+        partial = full_response[: i + step]
+        yield base[:-1] + [[message, partial]]
+
+    # Final ensure complete
+    yield base[:-1] + [[message, full_response]]
+
+
+def handle_chat(message, history, system_prompt, temperature, streaming):
+    return (
+        chat_stream_fn(message, history, system_prompt, temperature)
+        if streaming
+        else chat_fn(message, history, system_prompt, temperature)
+    )
+
+
+# =========================
+# Gradio UI
+# =========================
 with gr.Blocks(
     title="MobileLLM-Pro Chat",
     theme=gr.themes.Soft(),
     css="""
-    .gradio-container {
-        max-width: 900px !important;
-        margin: auto !important;
-    }
-    .message {
-        padding: 12px !important;
-        border-radius: 8px !important;
-        margin-bottom: 8px !important;
-    }
-    .user-message {
-        background-color: #e3f2fd !important;
-        margin-left: 20% !important;
-    }
-    .assistant-message {
-        background-color: #f5f5f5 !important;
-        margin-right: 20% !important;
-    }
+    .gradio-container { max-width: 900px !important; margin: auto !important; }
+    .message { padding: 12px !important; border-radius: 8px !important; margin-bottom: 8px !important; }
+    .user-message { background-color: #e3f2fd !important; margin-left: 20% !important; }
+    .assistant-message { background-color: #f5f5f5 !important; margin-right: 20% !important; }
     """
 ) as demo:
-
+
     # Header
-    gr.HTML("""
-    <div style="text-align: center; margin-bottom: 20px;">
-        <h1>🤖 MobileLLM-Pro Chat</h1>
-        <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
-        <p>Chat with Facebook's MobileLLM-Pro model optimized for on-device inference</p>
-    </div>
-    """)
-
-    # Model status indicator
+    gr.HTML(
+        """
+        <div style="text-align: center; margin-bottom: 20px;">
+            <h1>🤖 MobileLLM-Pro Chat</h1>
+            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
+            <p>Chat with Facebook's MobileLLM-Pro model optimized for on-device inference</p>
+        </div>
+        """
+    )
+
+    # Model status
     with gr.Row():
         model_status = gr.Textbox(
             label="Model Status",
             value="Model loaded and ready!" if chat_model.model_loaded else "Model loading...",
             interactive=False,
-            container=True
+            container=True,
         )
-
-    # Configuration section
+
+    # Config
     with gr.Accordion("⚙️ Configuration", open=False):
         with gr.Row():
             system_prompt = gr.Textbox(
                 value=DEFAULT_SYSTEM_PROMPT,
                 label="System Prompt",
                 lines=3,
-                info="Customize the AI's behavior and personality"
+                info="Customize the AI's behavior and personality",
             )
-
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1,
@@ -312,56 +266,47 @@ with gr.Blocks(
                 value=0.7,
                 step=0.1,
                 label="Temperature",
-                info="Controls randomness (higher = more creative)"
+                info="Controls randomness (higher = more creative)",
            )
-
         streaming = gr.Checkbox(
             value=True,
             label="Enable Streaming",
-            info="Show responses as they're being generated"
+            info="Show responses as they're being generated",
         )
-
-    # Chat interface
+
+    # Chatbot in TUPLES mode
     chatbot = gr.Chatbot(
         label="Chat History",
         height=500,
-        show_copy_button=True
+        show_copy_button=True,
     )
-
+
     with gr.Row():
         msg = gr.Textbox(
             label="Your Message",
             placeholder="Type your message here...",
             scale=4,
-            container=False
+            container=False,
         )
         submit_btn = gr.Button("Send", variant="primary", scale=1)
         clear_btn = gr.Button("Clear", scale=0)
-
-    # Handle chat submission
-    def handle_chat(message, history, system_prompt, temperature, streaming):
-        if streaming:
-            return chat_stream_fn(message, history, system_prompt, temperature)
-        else:
-            return chat_fn(message, history, system_prompt, temperature)
-
+
+    # Wire events
     msg.submit(
         handle_chat,
         inputs=[msg, chatbot, system_prompt, temperature, streaming],
-        outputs=[chatbot]
+        outputs=[chatbot],
    )
-
     submit_btn.click(
         handle_chat,
         inputs=[msg, chatbot, system_prompt, temperature, streaming],
-        outputs=[chatbot]
+        outputs=[chatbot],
     )
-
     clear_btn.click(
         clear_chat,
-        outputs=[chatbot, msg]
+        outputs=[chatbot, msg],
     )
-
+
     # Examples
     gr.Examples(
         examples=[
@@ -372,21 +317,25 @@ with gr.Blocks(
             ["How can I improve my productivity?"],
         ],
         inputs=[msg],
-        label="Example Prompts"
+        label="Example Prompts",
    )
-
+
     # Footer
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 20px; color: #666;">
-        <p>⚠️ Note: Model is pre-loaded for faster inference. GPU is allocated only during generation.</p>
-        <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
-    </div>
-    """)
-
-# Launch the app
+    gr.HTML(
+        """
+        <div style="text-align: center; margin-top: 20px; color: #666;">
+            <p>⚠️ Note: Model is pre-loaded for faster inference. GPU is allocated only during generation.</p>
+            <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
+        </div>
+        """
+    )
+
+    # Optional: queue to improve streaming UX
+    demo.queue()
+
+# Launch (NO share=True on Spaces)
 if __name__ == "__main__":
     demo.launch(
-        share=True,
         show_error=True,
-        debug=True
-    )
+        debug=True,
+    )
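
The updated generate_response above replaces the old string-based prompt stripping with a simpler decode pattern: build the prompt via the tokenizer's chat template, then keep only the tokens emitted after the prompt so the reply never echoes the system prompt or user input. A minimal standalone sketch of that pattern, not part of this commit (the chat_once helper and its arguments are illustrative; any causal LM whose tokenizer defines a chat template is assumed):

import torch

def chat_once(model, tokenizer, messages, max_new_tokens=128, temperature=0.7):
    # Build prompt ids from the chat template, ending with the assistant generation prompt.
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    attention_mask = torch.ones_like(input_ids)  # single unpadded sequence -> all-ones mask
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Keep only the tokens generated after the prompt, so the prompt is never echoed.
    new_ids = output_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(new_ids, skip_special_tokens=True).strip()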