use proper models
Files changed:
- __pycache__/app.cpython-312.pyc  (ADDED; binary file, 28.3 kB)  +0 -0
- app.py  (CHANGED)  +100 -26

app.py:
@@ -40,6 +40,12 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
+    # Accessible models (no gating required)
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct - accessible and reliable"},
+    "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct - accessible and reliable"},
+    "microsoft-DialoGPT-medium": {"repo_id": "microsoft/DialoGPT-medium", "description": "Microsoft DialoGPT Medium - accessible conversational model"},
+    "microsoft-DialoGPT-large": {"repo_id": "microsoft/DialoGPT-large", "description": "Microsoft DialoGPT Large - accessible conversational model"},
+
     # … your existing entries …
     "gpt-oss-20b": {"repo_id": "openai/gpt-oss-20b", "description": "openai/gpt-oss-20b"},
     "Qwen2.5-Taiwan-1.5B-Instruct": {"repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct", "description": "Qwen2.5-Taiwan-1.5B-Instruct"},
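
Note: not every entry added here ships a chat template (the DialoGPT repos, for example, do not), so those models will fall through to the plain "User:/Assistant:" prompt built in format_conversation further down. A minimal sketch for checking this up front, using the same transformers package app.py already relies on (has_chat_template is a hypothetical helper, not part of the change):

    from transformers import AutoTokenizer

    def has_chat_template(repo_id: str) -> bool:
        # Tokenizers that define no chat_template report None here and will be
        # formatted with the fallback prompt instead of apply_chat_template().
        tok = AutoTokenizer.from_pretrained(repo_id)
        return bool(getattr(tok, "chat_template", None))

    # Expected (unverified here): True for "Qwen/Qwen2.5-3B-Instruct",
    # False for "microsoft/DialoGPT-medium".
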
@@ -51,9 +57,9 @@ MODELS = {
         "repo_id":"lianghsun/Gemma-3-Taiwan-270M-it",
         "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
     },
-    "gemma-
-        "repo_id":"google/gemma-
-        "description":"Gemma
+    "gemma-2-2b-it":{
+        "repo_id":"google/gemma-2-2b-it",
+        "description":"Gemma 2 2B Instruction-Tuned model - accessible alternative to Gemma 3",
     },
     "SmolLM-135M-Taiwan-Instruct-v1.0": {
         "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
@@ -143,6 +149,33 @@ MODELS = {
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
 
+def check_model_accessibility(repo_id, token=None):
+    """
+    Check if a model is accessible without actually loading it.
+    Returns True if accessible, False if gated, raises exception for other errors.
+    """
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi(token=token)
+        model_info = api.model_info(repo_id)
+
+        # Check if model is gated
+        if hasattr(model_info, 'gated') and model_info.gated:
+            logger.warning(f"⚠️ Model {repo_id} is gated and requires special access")
+            return False
+
+        logger.info(f"✅ Model {repo_id} is accessible")
+        return True
+
+    except Exception as e:
+        error_msg = str(e)
+        if "gated" in error_msg.lower() or "401" in error_msg or "access" in error_msg.lower():
+            logger.warning(f"⚠️ Model {repo_id} appears to be gated or requires access")
+            return False
+        else:
+            logger.error(f"❌ Error checking model accessibility: {e}")
+            raise
+
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
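
For reference, the `gated` attribute that check_model_accessibility reads comes from huggingface_hub's ModelInfo: public repos report False, while gated repos typically report a string such as "auto" or "manual", so the truthiness test above treats any gated variant as inaccessible; the except branch additionally classifies 401/permission failures by message text. A minimal standalone sketch using repo IDs from the MODELS table (expected values, not verified here):

    from huggingface_hub import HfApi

    info = HfApi().model_info("Qwen/Qwen2.5-3B-Instruct")
    print(info.gated)  # expected: False for this public repo

    info = HfApi().model_info("google/gemma-2-2b-it")
    print(info.gated)  # expected: truthy ("auto"/"manual") for a gated repo
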
@@ -159,6 +192,16 @@ def load_pipeline(model_name):
     repo = MODELS[model_name]["repo_id"]
     logger.info(f"📦 Repository: {repo}")
 
+    # Check model accessibility first
+    try:
+        if not check_model_accessibility(repo, access_token):
+            raise Exception(f"Model {repo} is gated and requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+    except Exception as e:
+        if "gated" in str(e).lower() or "access" in str(e).lower():
+            raise
+        else:
+            logger.warning(f"⚠️ Could not check model accessibility, proceeding with load attempt: {e}")
+
     # Load tokenizer
     logger.info(f"🔤 Loading tokenizer for {repo}...")
     try:
@@ -166,8 +209,15 @@ def load_pipeline(model_name):
             token=access_token if access_token else None)
         logger.info(f"✅ Tokenizer loaded successfully")
     except Exception as e:
-
-
+        error_msg = str(e)
+        if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+            logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+            logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+            logger.error(f"💡 Or try a different model from the list")
+            raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+        else:
+            logger.error(f"❌ Failed to load tokenizer: {e}")
+            raise
 
     # Try different data types for optimal performance
     dtypes_to_try = [
@@ -195,8 +245,14 @@ def load_pipeline(model_name):
             return pipe
 
         except Exception as e:
-
-
+            error_msg = str(e)
+            if "gated repo" in error_msg or "401" in error_msg or "Access to model" in error_msg:
+                logger.error(f"❌ Model {repo} is gated and requires special access permissions")
+                logger.error(f"💡 Please visit https://huggingface.co/{repo} to request access")
+                raise Exception(f"Model {repo} requires special access. Please request access at https://huggingface.co/{repo} or choose a different model.")
+            else:
+                logger.warning(f"⚠️ Failed to load with {dtype_desc}: {e}")
+                continue
 
     # Final fallback without specific dtype
     logger.warning(f"🔄 Attempting final fallback load without specific dtype...")
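
The same message-based gated-repo check now appears in both the tokenizer and model load paths; a small helper along these lines (is_gated_error is a hypothetical name, not part of the diff) would keep the two branches consistent:

    def is_gated_error(exc: Exception) -> bool:
        """Heuristic: does this exception look like a gated-repo/access failure?"""
        msg = str(exc)
        return ("gated repo" in msg
                or "401" in msg
                or "Access to model" in msg
                or "gated" in msg.lower())
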
@@ -243,13 +299,21 @@ def retrieve_context(query, max_results=6, max_chars=600):
     return []
 
 def format_conversation(history, system_prompt, tokenizer):
+    # Convert Gradio tuple format to message format for tokenizer
+    messages = [{"role": "system", "content": system_prompt.strip()}]
+
+    for user_msg, bot_msg in history:
+        if user_msg:  # Add user message
+            messages.append({"role": "user", "content": user_msg})
+        if bot_msg:  # Add bot message
+            messages.append({"role": "assistant", "content": bot_msg})
+
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
-        messages = [{"role": "system", "content": system_prompt.strip()}] + history
         return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
-        for msg in
+        for msg in messages[1:]:  # Skip system message
            if msg['role'] == 'user':
                prompt += "User: " + msg['content'].strip() + "\n"
            elif msg['role'] == 'assistant':
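
As a concrete example of the conversion above (values are illustrative), a tuple-format history maps to messages like this:

    history = [("Hello", "Hi, how can I help?"), ("What is 2+2?", None)]
    # After the loop, messages holds (after the system entry):
    # [{"role": "user", "content": "Hello"},
    #  {"role": "assistant", "content": "Hi, how can I help?"},
    #  {"role": "user", "content": "What is 2+2?"}]
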
@@ -273,9 +337,18 @@ def chat_response(user_msg, chat_history, system_prompt,
     logger.info(f"🔍 Web search enabled: {enable_search}")
     logger.info(f"⚙️ Generation params: max_tokens={max_tokens}, temp={temperature}, top_k={top_k}, top_p={top_p}")
 
+    # Validate inputs
+    if not user_msg or not user_msg.strip():
+        logger.error("❌ Empty user message received")
+        return [], "Error: Empty message received"
+
+    if model_name not in MODELS:
+        logger.error(f"❌ Invalid model name: {model_name}")
+        return [], f"Error: Invalid model '{model_name}'"
+
     cancel_event.clear()
     history = list(chat_history or [])
-    history.append(
+    history.append((user_msg, None))  # Add user message, bot response will be added later
     logger.info(f"📝 Chat history length: {len(history)} messages")
 
     # Launch web search if enabled
@@ -404,25 +477,21 @@ def chat_response(user_msg, chat_history, system_prompt,
                 logger.info("💭 Detected thinking block start")
                 in_thought = True
                 # Insert thought placeholder
-                history.append(
-                    'role': 'assistant',
-                    'content': '',
-                    'metadata': {'title': '💭 Thought'}
-                })
+                history.append((None, "💭 Thinking..."))
                 # Capture after opening tag
                 after = text.split('<think>', 1)[1]
                 thought_buf += after
                 # If closing tag in same chunk
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]
+                    history[-1] = (None, f"💭 {before.strip()}")
                     in_thought = False
                     logger.info("💭 Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append(
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]
+                    history[-1] = (None, f"💭 {thought_buf}")
                 yield history, debug
                 continue
 
@@ -431,23 +500,23 @@ def chat_response(user_msg, chat_history, system_prompt,
                 thought_buf += text
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
-                    history[-1]
+                    history[-1] = (None, f"💭 {before.strip()}")
                     in_thought = False
                     logger.info("💭 Thinking block completed, starting answer")
                     # Start answer buffer
                     answer_buf = after2
-                    history.append(
+                    history.append((None, answer_buf))
                 else:
-                    history[-1]
+                    history[-1] = (None, f"💭 {thought_buf}")
                 yield history, debug
                 continue
 
             # Stream answer
             if not answer_buf:
                 logger.info("📝 Starting answer generation")
-                history.append(
+                history.append((None, ''))
             answer_buf += text
-            history[-1]
+            history[-1] = (None, answer_buf)
             yield history, debug
 
     gen_thread.join()
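
The two branches above implement a small state machine over the <think>…</think> markers in the token stream. The same logic in isolation, as a rough sketch without the Gradio history bookkeeping (function name and structure are illustrative, not part of the diff):

    def split_stream(chunks):
        """Yield ("thought", text) and ("answer", text) views of a streamed reply."""
        in_thought, thought_buf, answer_buf = False, "", ""
        for text in chunks:
            if not in_thought and '<think>' in text:
                in_thought = True
                thought_buf = text.split('<think>', 1)[1]
            elif in_thought:
                thought_buf += text
            else:
                answer_buf += text
                yield ("answer", answer_buf)
                continue
            if '</think>' in thought_buf:
                before, after = thought_buf.split('</think>', 1)
                yield ("thought", before.strip())
                in_thought = False
                answer_buf = after
                yield ("answer", answer_buf)
            else:
                yield ("thought", thought_buf)
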
@@ -455,7 +524,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         yield history, debug + prompt_debug
     except Exception as e:
         logger.error(f"❌ Error during generation: {e}")
-        history.append(
+        history.append((None, f"Error: {e}"))
         yield history, debug
     finally:
         logger.info("🧹 Cleaning up memory...")
@@ -478,6 +547,7 @@ def update_default_prompt(enable_search):
 with gr.Blocks(title="LLM Inference") as demo:
     gr.Markdown("## 🧠 LLM Inference with Web Search")
     gr.Markdown("Interact with the model. Select parameters and chat below.")
+    gr.Markdown("💡 **Tip**: If you get access errors, try models like 'Qwen2.5-3B-Instruct' or 'microsoft-DialoGPT-medium' which are publicly accessible.")
     with gr.Row():
         with gr.Column(scale=3):
             model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value=list(MODELS.keys())[0])
@@ -496,7 +566,7 @@ with gr.Blocks(title="LLM Inference") as demo:
             clr = gr.Button("Clear Chat")
             cnl = gr.Button("Cancel Generation")
         with gr.Column(scale=7):
-            chat = gr.Chatbot(
+            chat = gr.Chatbot()
             txt = gr.Textbox(placeholder="Type your message and press Enter...")
             dbg = gr.Markdown()
 
@@ -508,4 +578,8 @@ with gr.Blocks(title="LLM Inference") as demo:
                                model_dd, max_tok, temp, k, p, rp, st],
                        outputs=[chat, dbg])
 logger.info("🚀 Starting Gradio application...")
-
+try:
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+except Exception as e:
+    logger.error(f"❌ Failed to launch Gradio app: {e}")
+    raise
|