Luigi committed
Commit ab92e0d · 1 Parent(s): 8cdf3e1

Improve model size detection: replace ad-hoc string parsing with reliable params_b field in MODELS dict

Files changed (1): app.py (+95 -74)
app.py CHANGED
@@ -29,46 +29,56 @@ cancel_event = threading.Event()
 MODELS = {
     "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": {
         "repo_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
-        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications [[2]][[3]][[5]][[8]]."
+        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications [[2]][[3]][[5]][[8]].",
+        "params_b": 80.0
     },
     "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": {
         "repo_id": "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
-        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser [[10]][[11]][[14]][[18]]."
+        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser [[10]][[11]][[14]][[18]].",
+        "params_b": 80.0
     },
     "Qwen3-32B-FP8": {
         "repo_id": "Qwen/Qwen3-32B-FP8",
-        "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities."
+        "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities.",
+        "params_b": 32.8
     },
     # ~30.5B total parameters (MoE: 3.3B activated)
     "Qwen3-30B-A3B-Instruct-2507": {
         "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
-        "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision."
+        "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision.",
+        "params_b": 30.5
     },
     "Qwen3-30B-A3B-Thinking-2507": {
         "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507",
-        "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
+        "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers.",
+        "params_b": 30.5
     },
     "gpt-oss-20b-BF16": {
         "repo_id": "unsloth/gpt-oss-20b-BF16",
-        "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities."
+        "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities.",
+        "params_b": 20.0
     },
     "Qwen3-4B-Instruct-2507": {
         "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
-        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
+        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
+        "params_b": 4.0
     },
     "Apriel-1.5-15b-Thinker": {
         "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
-        "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities."
+        "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
+        "params_b": 15.0
     },

     # 14.8B total parameters
     "Qwen3-14B": {
         "repo_id": "Qwen/Qwen3-14B",
-        "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."
+        "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration.",
+        "params_b": 14.8
     },
     "Qwen/Qwen3-14B-FP8": {
         "repo_id": "Qwen/Qwen3-14B-FP8",
-        "description": "FP8-quantized version of Qwen3-14B for efficient inference."
+        "description": "FP8-quantized version of Qwen3-14B for efficient inference.",
+        "params_b": 14.8
     },

     # ~15B (commented out in original, but larger than 14B)
@@ -83,34 +93,41 @@ MODELS = {
     # 4.3B
     "Phi-4-mini-Reasoning": {
         "repo_id": "microsoft/Phi-4-mini-reasoning",
-        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
+        "description": "Phi-4-mini-Reasoning (4.3B parameters)",
+        "params_b": 4.3
     },
     "Phi-4-mini-Instruct": {
         "repo_id": "microsoft/Phi-4-mini-instruct",
-        "description": "Phi-4-mini-Instruct (4.3B parameters)"
+        "description": "Phi-4-mini-Instruct (4.3B parameters)",
+        "params_b": 4.3
     },

     # 4.0B
     "Qwen3-4B": {
         "repo_id": "Qwen/Qwen3-4B",
-        "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."
+        "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning.",
+        "params_b": 4.0
     },

     "Gemma-3-4B-IT": {
         "repo_id": "unsloth/gemma-3-4b-it",
-        "description": "Gemma-3-4B-IT"
+        "description": "Gemma-3-4B-IT",
+        "params_b": 4.0
     },
     "MiniCPM3-4B": {
         "repo_id": "openbmb/MiniCPM3-4B",
-        "description": "MiniCPM3-4B"
+        "description": "MiniCPM3-4B",
+        "params_b": 4.0
     },
     "Gemma-3n-E4B": {
         "repo_id": "google/gemma-3n-E4B",
-        "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
+        "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)",
+        "params_b": 4.0
     },
     "SmallThinker-4BA0.6B-Instruct": {
         "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
-        "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned"
+        "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned",
+        "params_b": 4.0
     },

     # ~3B
@@ -120,151 +137,181 @@ MODELS = {
     # },
     "Qwen2.5-Taiwan-3B-Reason-GRPO": {
         "repo_id": "benchang1110/Qwen2.5-Taiwan-3B-Reason-GRPO",
-        "description": "Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned"
+        "description": "Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned",
+        "params_b": 3.0
     },
     "Llama-3.2-Taiwan-3B-Instruct": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct",
-        "description": "Llama-3.2-Taiwan-3B-Instruct"
+        "description": "Llama-3.2-Taiwan-3B-Instruct",
+        "params_b": 3.0
     },
     "Qwen2.5-3B-Instruct": {
         "repo_id": "Qwen/Qwen2.5-3B-Instruct",
-        "description": "Qwen2.5-3B-Instruct"
+        "description": "Qwen2.5-3B-Instruct",
+        "params_b": 3.0
     },
     "Qwen2.5-Omni-3B": {
         "repo_id": "Qwen/Qwen2.5-Omni-3B",
-        "description": "Qwen2.5-Omni-3B"
+        "description": "Qwen2.5-Omni-3B",
+        "params_b": 3.0
     },
     "Granite-4.0-Micro": {
         "repo_id": "ibm-granite/granite-4.0-micro",
-        "description": "A 3B-parameter long-context instruct model from IBM, finetuned for enhanced instruction following and tool-calling. Supports 12 languages including English, Chinese, Arabic, and Japanese. Built on a dense Transformer with GQA, RoPE, SwiGLU, and 128K context length. Trained using SFT, RL alignment, and model merging techniques for enterprise applications."
+        "description": "A 3B-parameter long-context instruct model from IBM, finetuned for enhanced instruction following and tool-calling. Supports 12 languages including English, Chinese, Arabic, and Japanese. Built on a dense Transformer with GQA, RoPE, SwiGLU, and 128K context length. Trained using SFT, RL alignment, and model merging techniques for enterprise applications.",
+        "params_b": 3.0
     },

     # 2.6B
     "LFM2-2.6B": {
         "repo_id": "LiquidAI/LFM2-2.6B",
-        "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
+        "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference.",
+        "params_b": 2.6
     },

     # 1.7B
     "Qwen3-1.7B": {
         "repo_id": "Qwen/Qwen3-1.7B",
-        "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."
+        "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages.",
+        "params_b": 1.7
     },

     # ~2B (effective)
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
-        "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
+        "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)",
+        "params_b": 2.0
     },

     # 1.5B
     "Nemotron-Research-Reasoning-Qwen-1.5B": {
         "repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
-        "description": "Nemotron-Research-Reasoning-Qwen-1.5B"
+        "description": "Nemotron-Research-Reasoning-Qwen-1.5B",
+        "params_b": 1.5
     },
     "Falcon-H1-1.5B-Instruct": {
         "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
-        "description": "Falcon‑H1 model with 1.5 B parameters, instruction‑tuned"
+        "description": "Falcon‑H1 model with 1.5 B parameters, instruction‑tuned",
+        "params_b": 1.5
     },
     "Qwen2.5-Taiwan-1.5B-Instruct": {
         "repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct",
-        "description": "Qwen2.5-Taiwan-1.5B-Instruct"
+        "description": "Qwen2.5-Taiwan-1.5B-Instruct",
+        "params_b": 1.5
     },

     # 1.2B
     "LFM2-1.2B": {
         "repo_id": "LiquidAI/LFM2-1.2B",
-        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
+        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks.",
+        "params_b": 1.2
     },

     # 1.1B
     "Taiwan-ELM-1_1B-Instruct": {
         "repo_id": "liswei/Taiwan-ELM-1_1B-Instruct",
-        "description": "Taiwan-ELM-1_1B-Instruct"
+        "description": "Taiwan-ELM-1_1B-Instruct",
+        "params_b": 1.1
     },

     # 1B
     "Llama-3.2-Taiwan-1B": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
-        "description": "Llama-3.2-Taiwan base model with 1 B parameters"
+        "description": "Llama-3.2-Taiwan base model with 1 B parameters",
+        "params_b": 1.0
     },

     # 700M
     "LFM2-700M": {
         "repo_id": "LiquidAI/LFM2-700M",
-        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
+        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions.",
+        "params_b": 0.7
     },

     # 600M
     "Qwen3-0.6B": {
         "repo_id": "Qwen/Qwen3-0.6B",
-        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."
+        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities.",
+        "params_b": 0.6
     },
     "Qwen3-0.6B-Taiwan": {
         "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
-        "description": "Qwen3-Taiwan model with 0.6 B parameters"
+        "description": "Qwen3-Taiwan model with 0.6 B parameters",
+        "params_b": 0.6
     },

     # 500M
     "Qwen2.5-0.5B-Taiwan-Instruct": {
         "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
-        "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
+        "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned",
+        "params_b": 0.5
     },

     # 360M
     "SmolLM2-360M-Instruct": {
         "repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
-        "description": "Original SmolLM2‑360M Instruct"
+        "description": "Original SmolLM2‑360M Instruct",
+        "params_b": 0.36
     },
     "SmolLM2-360M-Instruct-TaiwanChat": {
         "repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat",
-        "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"
+        "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat",
+        "params_b": 0.36
     },

     # 350M
     "LFM2-350M": {
         "repo_id": "LiquidAI/LFM2-350M",
-        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
+        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3.",
+        "params_b": 0.35
     },

     # 270M
     "parser_model_ner_gemma_v0.1": {
         "repo_id": "myfi/parser_model_ner_gemma_v0.1",
-        "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments."
+        "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments.",
+        "params_b": 0.27
     },
     "Gemma-3-Taiwan-270M-it": {
         "repo_id": "lianghsun/Gemma-3-Taiwan-270M-it",
-        "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
+        "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset",
+        "params_b": 0.27
     },
     "gemma-3-270m-it": {
         "repo_id": "google/gemma-3-270m-it",
         "description": "Gemma‑3‑270M‑IT is a compact, 270‑million‑parameter language model fine‑tuned for Italian, offering fast and efficient on‑device text generation and comprehension in the Italian language.",
+        "params_b": 0.27
     },
     "Taiwan-ELM-270M-Instruct": {
         "repo_id": "liswei/Taiwan-ELM-270M-Instruct",
-        "description": "Taiwan-ELM-270M-Instruct"
+        "description": "Taiwan-ELM-270M-Instruct",
+        "params_b": 0.27
     },

     # 135M
     "SmolLM2-135M-multilingual-base": {
         "repo_id": "agentlans/SmolLM2-135M-multilingual-base",
-        "description": "SmolLM2-135M-multilingual-base"
+        "description": "SmolLM2-135M-multilingual-base",
+        "params_b": 0.135
     },
     "SmolLM-135M-Taiwan-Instruct-v1.0": {
         "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
-        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
+        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks",
+        "params_b": 0.135
     },
     "SmolLM2_135M_Grpo_Gsm8k": {
         "repo_id": "prithivMLmods/SmolLM2_135M_Grpo_Gsm8k",
-        "description": "SmolLM2_135M_Grpo_Gsm8k"
+        "description": "SmolLM2_135M_Grpo_Gsm8k",
+        "params_b": 0.135
     },
     "SmolLM2-135M-Instruct": {
         "repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
-        "description": "Original SmolLM2‑135M Instruct"
+        "description": "Original SmolLM2‑135M Instruct",
+        "params_b": 0.135
     },
     "SmolLM2-135M-Instruct-TaiwanChat": {
         "repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat",
-        "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"
+        "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat",
+        "params_b": 0.135
     },
 }

@@ -338,34 +385,8 @@ def format_conversation(history, system_prompt, tokenizer):
     return prompt

 def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
-    # Estimate model size (rough approximation based on model name)
-    model_size = 0
-    if '30B' in model_name or '32B' in model_name:
-        model_size = 30
-    elif '20B' in model_name:
-        model_size = 20
-    elif '15B' in model_name or '14B' in model_name:
-        model_size = 15
-    elif '4B' in model_name or '3B' in model_name:
-        model_size = 4
-    elif '2B' in model_name or '1.7B' in model_name:
-        model_size = 2
-    elif '1.5B' in model_name or '1.2B' in model_name or '1.1B' in model_name:
-        model_size = 1.5
-    elif '1B' in model_name:
-        model_size = 1
-    elif '700M' in model_name or '600M' in model_name:
-        model_size = 0.7
-    elif '500M' in model_name:
-        model_size = 0.5
-    elif '360M' in model_name or '350M' in model_name:
-        model_size = 0.35
-    elif '270M' in model_name:
-        model_size = 0.27
-    elif '135M' in model_name:
-        model_size = 0.135
-    else:
-        model_size = 4  # default
+    # Get model size from the MODELS dict (more reliable than string parsing)
+    model_size = MODELS[model_name].get("params_b", 4.0)  # Default to 4B if not found

     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
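
With every entry in MODELS now carrying a params_b value, get_duration can read the parameter count directly instead of pattern-matching the model name. Below is a minimal sketch of that lookup path, assuming a trimmed-down MODELS dict and two hypothetical helpers (model_size_b, uses_aot) that mirror the logic now inlined in get_duration; it is not the full app.

```python
# Sketch only: MODELS is trimmed to two entries for illustration; the real dict
# in app.py has ~45 entries. The default of 4.0 and the >= 2B AOT threshold
# come from the commit.
MODELS = {
    "Qwen3-14B": {"repo_id": "Qwen/Qwen3-14B", "params_b": 14.8},
    "Qwen3-0.6B": {"repo_id": "Qwen/Qwen3-0.6B", "params_b": 0.6},
}

def model_size_b(model_name: str) -> float:
    # Read the declared parameter count (in billions); fall back to 4.0 if missing.
    return MODELS[model_name].get("params_b", 4.0)

def uses_aot(model_name: str) -> bool:
    # Ahead-of-time compilation is only enabled for models >= 2B parameters.
    return model_size_b(model_name) >= 2

if __name__ == "__main__":
    assert uses_aot("Qwen3-14B")        # 14.8B -> AOT on
    assert not uses_aot("Qwen3-0.6B")   # 0.6B  -> AOT off
    # A sanity check one might add so no entry silently falls back to the default:
    missing = [name for name, cfg in MODELS.items() if "params_b" not in cfg]
    assert not missing, f"entries missing params_b: {missing}"
```

The explicit field also avoids silent misclassifications in the old heuristic: its substring checks were case-sensitive, so a name like "gpt-oss-20b-BF16" matched none of the patterns and fell through to the 4B default, whereas the new lookup returns the declared 20.0.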