Luigi committed
Commit ab92e0d · 1 Parent(s): 8cdf3e1

Improve model size detection: replace ad-hoc string parsing with reliable params_b field in MODELS dict

Files changed (1): app.py (+95 -74)
app.py CHANGED
@@ -29,46 +29,56 @@ cancel_event = threading.Event()
 MODELS = {
     "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": {
         "repo_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
-        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications [[2]][[3]][[5]][[8]]."
+        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications [[2]][[3]][[5]][[8]].",
+        "params_b": 80.0
     },
     "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": {
         "repo_id": "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
-        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser [[10]][[11]][[14]][[18]]."
+        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include native 32,768-token context (extendable to 131,072 via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser [[10]][[11]][[14]][[18]].",
+        "params_b": 80.0
     },
     "Qwen3-32B-FP8": {
         "repo_id": "Qwen/Qwen3-32B-FP8",
-        "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities."
+        "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities.",
+        "params_b": 32.8
     },
     # ~30.5B total parameters (MoE: 3.3B activated)
     "Qwen3-30B-A3B-Instruct-2507": {
         "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
-        "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision."
+        "description": "non-thinking-mode MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision.",
+        "params_b": 30.5
     },
     "Qwen3-30B-A3B-Thinking-2507": {
         "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507",
-        "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
+        "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers.",
+        "params_b": 30.5
     },
     "gpt-oss-20b-BF16": {
         "repo_id": "unsloth/gpt-oss-20b-BF16",
-        "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities."
+        "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities.",
+        "params_b": 20.0
     },
     "Qwen3-4B-Instruct-2507": {
         "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
-        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
+        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks.",
+        "params_b": 4.0
     },
     "Apriel-1.5-15b-Thinker": {
         "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
-        "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities."
+        "description": "Multimodal reasoning model with 15B parameters, trained via extensive mid-training on text and image data, and fine-tuned only on text (no image SFT). Achieves competitive performance on reasoning benchmarks like Artificial Analysis (score: 52), Tau2 Bench Telecom (68), and IFBench (62). Supports both text and image understanding, fits on a single GPU, and includes structured reasoning output with tool and function calling capabilities.",
+        "params_b": 15.0
     },

     # 14.8B total parameters
     "Qwen3-14B": {
         "repo_id": "Qwen/Qwen3-14B",
-        "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."
+        "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration.",
+        "params_b": 14.8
     },
     "Qwen/Qwen3-14B-FP8": {
         "repo_id": "Qwen/Qwen3-14B-FP8",
-        "description": "FP8-quantized version of Qwen3-14B for efficient inference."
+        "description": "FP8-quantized version of Qwen3-14B for efficient inference.",
+        "params_b": 14.8
     },

     # ~15B (commented out in original, but larger than 14B)
@@ -83,34 +93,41 @@ MODELS = {
     # 4.3B
     "Phi-4-mini-Reasoning": {
         "repo_id": "microsoft/Phi-4-mini-reasoning",
-        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
+        "description": "Phi-4-mini-Reasoning (4.3B parameters)",
+        "params_b": 4.3
     },
     "Phi-4-mini-Instruct": {
         "repo_id": "microsoft/Phi-4-mini-instruct",
-        "description": "Phi-4-mini-Instruct (4.3B parameters)"
+        "description": "Phi-4-mini-Instruct (4.3B parameters)",
+        "params_b": 4.3
     },

     # 4.0B
     "Qwen3-4B": {
         "repo_id": "Qwen/Qwen3-4B",
-        "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."
+        "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning.",
+        "params_b": 4.0
     },

     "Gemma-3-4B-IT": {
         "repo_id": "unsloth/gemma-3-4b-it",
-        "description": "Gemma-3-4B-IT"
+        "description": "Gemma-3-4B-IT",
+        "params_b": 4.0
     },
     "MiniCPM3-4B": {
         "repo_id": "openbmb/MiniCPM3-4B",
-        "description": "MiniCPM3-4B"
+        "description": "MiniCPM3-4B",
+        "params_b": 4.0
     },
     "Gemma-3n-E4B": {
         "repo_id": "google/gemma-3n-E4B",
-        "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
+        "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)",
+        "params_b": 4.0
     },
     "SmallThinker-4BA0.6B-Instruct": {
         "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
-        "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned"
+        "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned",
+        "params_b": 4.0
     },

     # ~3B
@@ -120,151 +137,181 @@ MODELS = {
     # },
     "Qwen2.5-Taiwan-3B-Reason-GRPO": {
         "repo_id": "benchang1110/Qwen2.5-Taiwan-3B-Reason-GRPO",
-        "description": "Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned"
+        "description": "Qwen2.5-Taiwan model with 3 B parameters, Reason-GRPO fine-tuned",
+        "params_b": 3.0
     },
     "Llama-3.2-Taiwan-3B-Instruct": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct",
-        "description": "Llama-3.2-Taiwan-3B-Instruct"
+        "description": "Llama-3.2-Taiwan-3B-Instruct",
+        "params_b": 3.0
     },
     "Qwen2.5-3B-Instruct": {
         "repo_id": "Qwen/Qwen2.5-3B-Instruct",
-        "description": "Qwen2.5-3B-Instruct"
+        "description": "Qwen2.5-3B-Instruct",
+        "params_b": 3.0
     },
     "Qwen2.5-Omni-3B": {
         "repo_id": "Qwen/Qwen2.5-Omni-3B",
-        "description": "Qwen2.5-Omni-3B"
+        "description": "Qwen2.5-Omni-3B",
+        "params_b": 3.0
     },
     "Granite-4.0-Micro": {
         "repo_id": "ibm-granite/granite-4.0-micro",
-        "description": "A 3B-parameter long-context instruct model from IBM, finetuned for enhanced instruction following and tool-calling. Supports 12 languages including English, Chinese, Arabic, and Japanese. Built on a dense Transformer with GQA, RoPE, SwiGLU, and 128K context length. Trained using SFT, RL alignment, and model merging techniques for enterprise applications."
+        "description": "A 3B-parameter long-context instruct model from IBM, finetuned for enhanced instruction following and tool-calling. Supports 12 languages including English, Chinese, Arabic, and Japanese. Built on a dense Transformer with GQA, RoPE, SwiGLU, and 128K context length. Trained using SFT, RL alignment, and model merging techniques for enterprise applications.",
+        "params_b": 3.0
     },

     # 2.6B
     "LFM2-2.6B": {
         "repo_id": "LiquidAI/LFM2-2.6B",
-        "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
+        "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference.",
+        "params_b": 2.6
     },

     # 1.7B
     "Qwen3-1.7B": {
         "repo_id": "Qwen/Qwen3-1.7B",
-        "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."
+        "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages.",
+        "params_b": 1.7
     },

     # ~2B (effective)
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
-        "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
+        "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)",
+        "params_b": 2.0
     },

     # 1.5B
     "Nemotron-Research-Reasoning-Qwen-1.5B": {
         "repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
-        "description": "Nemotron-Research-Reasoning-Qwen-1.5B"
+        "description": "Nemotron-Research-Reasoning-Qwen-1.5B",
+        "params_b": 1.5
     },
     "Falcon-H1-1.5B-Instruct": {
         "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
-        "description": "Falcon‑H1 model with 1.5 B parameters, instruction‑tuned"
+        "description": "Falcon‑H1 model with 1.5 B parameters, instruction‑tuned",
+        "params_b": 1.5
     },
     "Qwen2.5-Taiwan-1.5B-Instruct": {
         "repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct",
-        "description": "Qwen2.5-Taiwan-1.5B-Instruct"
+        "description": "Qwen2.5-Taiwan-1.5B-Instruct",
+        "params_b": 1.5
     },

     # 1.2B
     "LFM2-1.2B": {
         "repo_id": "LiquidAI/LFM2-1.2B",
-        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
+        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks.",
+        "params_b": 1.2
     },

     # 1.1B
     "Taiwan-ELM-1_1B-Instruct": {
         "repo_id": "liswei/Taiwan-ELM-1_1B-Instruct",
-        "description": "Taiwan-ELM-1_1B-Instruct"
+        "description": "Taiwan-ELM-1_1B-Instruct",
+        "params_b": 1.1
     },

     # 1B
     "Llama-3.2-Taiwan-1B": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
-        "description": "Llama-3.2-Taiwan base model with 1 B parameters"
+        "description": "Llama-3.2-Taiwan base model with 1 B parameters",
+        "params_b": 1.0
     },

     # 700M
     "LFM2-700M": {
         "repo_id": "LiquidAI/LFM2-700M",
-        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
+        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions.",
+        "params_b": 0.7
     },

     # 600M
     "Qwen3-0.6B": {
         "repo_id": "Qwen/Qwen3-0.6B",
-        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."
+        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities.",
+        "params_b": 0.6
     },
     "Qwen3-0.6B-Taiwan": {
         "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
-        "description": "Qwen3-Taiwan model with 0.6 B parameters"
+        "description": "Qwen3-Taiwan model with 0.6 B parameters",
+        "params_b": 0.6
     },

     # 500M
     "Qwen2.5-0.5B-Taiwan-Instruct": {
         "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
-        "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
+        "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned",
+        "params_b": 0.5
     },

     # 360M
     "SmolLM2-360M-Instruct": {
         "repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
-        "description": "Original SmolLM2‑360M Instruct"
+        "description": "Original SmolLM2‑360M Instruct",
+        "params_b": 0.36
     },
     "SmolLM2-360M-Instruct-TaiwanChat": {
         "repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat",
-        "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"
+        "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat",
+        "params_b": 0.36
     },

     # 350M
     "LFM2-350M": {
         "repo_id": "LiquidAI/LFM2-350M",
-        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
+        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3.",
+        "params_b": 0.35
     },

     # 270M
     "parser_model_ner_gemma_v0.1": {
         "repo_id": "myfi/parser_model_ner_gemma_v0.1",
-        "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments."
+        "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments.",
+        "params_b": 0.27
     },
     "Gemma-3-Taiwan-270M-it": {
         "repo_id": "lianghsun/Gemma-3-Taiwan-270M-it",
-        "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset"
+        "description": "google/gemma-3-270m-it fintuned on Taiwan Chinese dataset",
+        "params_b": 0.27
     },
     "gemma-3-270m-it": {
         "repo_id": "google/gemma-3-270m-it",
         "description": "Gemma‑3‑270M‑IT is a compact, 270‑million‑parameter language model fine‑tuned for Italian, offering fast and efficient on‑device text generation and comprehension in the Italian language.",
+        "params_b": 0.27
     },
     "Taiwan-ELM-270M-Instruct": {
         "repo_id": "liswei/Taiwan-ELM-270M-Instruct",
-        "description": "Taiwan-ELM-270M-Instruct"
+        "description": "Taiwan-ELM-270M-Instruct",
+        "params_b": 0.27
     },

     # 135M
     "SmolLM2-135M-multilingual-base": {
         "repo_id": "agentlans/SmolLM2-135M-multilingual-base",
-        "description": "SmolLM2-135M-multilingual-base"
+        "description": "SmolLM2-135M-multilingual-base",
+        "params_b": 0.135
     },
     "SmolLM-135M-Taiwan-Instruct-v1.0": {
         "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
-        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks"
+        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks",
+        "params_b": 0.135
     },
     "SmolLM2_135M_Grpo_Gsm8k": {
         "repo_id": "prithivMLmods/SmolLM2_135M_Grpo_Gsm8k",
-        "description": "SmolLM2_135M_Grpo_Gsm8k"
+        "description": "SmolLM2_135M_Grpo_Gsm8k",
+        "params_b": 0.135
     },
     "SmolLM2-135M-Instruct": {
         "repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
-        "description": "Original SmolLM2‑135M Instruct"
+        "description": "Original SmolLM2‑135M Instruct",
+        "params_b": 0.135
     },
     "SmolLM2-135M-Instruct-TaiwanChat": {
         "repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat",
-        "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"
+        "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat",
+        "params_b": 0.135
     },
 }

@@ -338,34 +385,8 @@ def format_conversation(history, system_prompt, tokenizer):
     return prompt

 def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
-    # Estimate model size (rough approximation based on model name)
-    model_size = 0
-    if '30B' in model_name or '32B' in model_name:
-        model_size = 30
-    elif '20B' in model_name:
-        model_size = 20
-    elif '15B' in model_name or '14B' in model_name:
-        model_size = 15
-    elif '4B' in model_name or '3B' in model_name:
-        model_size = 4
-    elif '2B' in model_name or '1.7B' in model_name:
-        model_size = 2
-    elif '1.5B' in model_name or '1.2B' in model_name or '1.1B' in model_name:
-        model_size = 1.5
-    elif '1B' in model_name:
-        model_size = 1
-    elif '700M' in model_name or '600M' in model_name:
-        model_size = 0.7
-    elif '500M' in model_name:
-        model_size = 0.5
-    elif '360M' in model_name or '350M' in model_name:
-        model_size = 0.35
-    elif '270M' in model_name:
-        model_size = 0.27
-    elif '135M' in model_name:
-        model_size = 0.135
-    else:
-        model_size = 4  # default
+    # Get model size from the MODELS dict (more reliable than string parsing)
+    model_size = MODELS[model_name].get("params_b", 4.0)  # Default to 4B if not found

     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
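
With every entry in MODELS now carrying a params_b value, get_duration can read the parameter count directly instead of pattern-matching the model name. Below is a minimal sketch of that lookup path, assuming a trimmed-down MODELS dict and two hypothetical helpers (model_size_b, uses_aot) that mirror the logic now inlined in get_duration; it is not the full app.

```python
# Sketch only: MODELS is trimmed to two entries for illustration; the real dict
# in app.py has ~45 entries. The default of 4.0 and the >= 2B AOT threshold
# come from the commit.
MODELS = {
    "Qwen3-14B": {"repo_id": "Qwen/Qwen3-14B", "params_b": 14.8},
    "Qwen3-0.6B": {"repo_id": "Qwen/Qwen3-0.6B", "params_b": 0.6},
}

def model_size_b(model_name: str) -> float:
    # Read the declared parameter count (in billions); fall back to 4.0 if missing.
    return MODELS[model_name].get("params_b", 4.0)

def uses_aot(model_name: str) -> bool:
    # Ahead-of-time compilation is only enabled for models >= 2B parameters.
    return model_size_b(model_name) >= 2

if __name__ == "__main__":
    assert uses_aot("Qwen3-14B")        # 14.8B -> AOT on
    assert not uses_aot("Qwen3-0.6B")   # 0.6B  -> AOT off
    # A sanity check one might add so no entry silently falls back to the default:
    missing = [name for name, cfg in MODELS.items() if "params_b" not in cfg]
    assert not missing, f"entries missing params_b: {missing}"
```

The explicit field also avoids silent misclassifications in the old heuristic: its substring checks were case-sensitive, so a name like "gpt-oss-20b-BF16" matched none of the patterns and fell through to the 4B default, whereas the new lookup returns the declared 20.0.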