Luigi committed (verified)
Commit e5a1663 · 1 Parent(s): de64679

Update app.py

Files changed (1): app.py +56 -26
app.py CHANGED
@@ -26,11 +26,20 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
-    # Models with 14B+ parameters
-    # "Apriel-1.5-15b-Thinker": {
-    #     "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
-    #     "description": "A 15B multimodal reasoning model from ServiceNow’s Apriel series. Achieves SOTA performance on text and image reasoning (52 on Artificial Analysis index, 68 on Tau2 Bench Telecom, 62 on IFBench) despite undergoing only text SFT—no image fine-tuning. Fits on a single GPU and competes with models 10× its size like Deepseek R1 and Gemini-Flash."
-    # },
+    # ~30.5B total parameters (MoE: 3.3B activated)
+    "Qwen3-30B-A3B-Thinking-2507-FP8": {
+        "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8",
+        "description": "FP8-quantized MoE model with 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Optimized for complex reasoning tasks with enhanced thinking capabilities in mathematics, coding, science, and agent benchmarks. Supports only thinking mode; includes automatic reasoning delimiters."
+    },
+    "Qwen3-30B-A3B-Instruct-2507-FP8": {
+        "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8",
+        "description": "FP8-quantized instruct-tuned variant of Qwen3-30B-A3B (30.5B total params, 3.3B activated), featuring strong general capabilities in instruction following, tool usage, text generation, and 256K long-context understanding. Ideal for agentic and multi-turn dialogue applications."
+    },
+
+    # ~235B total parameters (MoE: 22B activated) — included for reference if added later
+    # "Qwen3-235B-A22B-Thinking": { ... },
+
+    # 14.8B total parameters
     "Qwen3-14B": {
         "repo_id": "Qwen/Qwen3-14B",
         "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."
@@ -40,17 +49,34 @@ MODELS = {
         "description": "FP8-quantized version of Qwen3-14B for efficient inference."
     },

-    # Models with ~5B parameters
+    # ~15B (commented out in original, but larger than 14B)
+    # "Apriel-1.5-15b-Thinker": { ... },
+
+    # 5B
     "Apriel-5B-Instruct": {
         "repo_id": "ServiceNow-AI/Apriel-5B-Instruct",
         "description": "A 5B-parameter instruction-tuned model from ServiceNow’s Apriel series, optimized for enterprise tasks and general-purpose instruction following."
     },

-    # Models with 4B–4.3B parameters
+    # 4.3B
+    "Phi-4-mini-Reasoning": {
+        "repo_id": "microsoft/Phi-4-mini-reasoning",
+        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
+    },
+    "Phi-4-mini-Instruct": {
+        "repo_id": "microsoft/Phi-4-mini-instruct",
+        "description": "Phi-4-mini-Instruct (4.3B parameters)"
+    },
+
+    # 4.0B
     "Qwen3-4B": {
         "repo_id": "Qwen/Qwen3-4B",
         "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."
     },
+    "Qwen3-4B-Instruct-2507": {
+        "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
+        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
+    },
     "Gemma-3-4B-IT": {
         "repo_id": "unsloth/gemma-3-4b-it",
         "description": "Gemma-3-4B-IT"
@@ -63,24 +89,12 @@ MODELS = {
         "repo_id": "google/gemma-3n-E4B",
         "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
     },
-    "Phi-4-mini-Reasoning": {
-        "repo_id": "microsoft/Phi-4-mini-reasoning",
-        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
-    },
-    "Phi-4-mini-Instruct": {
-        "repo_id": "microsoft/Phi-4-mini-instruct",
-        "description": "Phi-4-mini-Instruct (4.3B parameters)"
-    },
     "SmallThinker-4BA0.6B-Instruct": {
         "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
         "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned"
     },
-    "Qwen3-4B-Instruct-2507": {
-        "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
-        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
-    },

-    # Models with ~3B parameters
+    # ~3B
     "AI21-Jamba-Reasoning-3B": {
         "repo_id": "ai21labs/AI21-Jamba-Reasoning-3B",
         "description": "A compact 3B hybrid Transformer–Mamba reasoning model with 256K context length, strong intelligence benchmark scores (61% MMLU-Pro, 52% IFBench), and efficient inference suitable for edge and datacenter use. Outperforms Gemma-3 4B and Llama-3.2 3B despite smaller size."
@@ -106,23 +120,25 @@ MODELS = {
         "description": "A 3B-parameter long-context instruct model from IBM, finetuned for enhanced instruction following and tool-calling. Supports 12 languages including English, Chinese, Arabic, and Japanese. Built on a dense Transformer with GQA, RoPE, SwiGLU, and 128K context length. Trained using SFT, RL alignment, and model merging techniques for enterprise applications."
     },

-    # Models with 2.6B parameters
+    # 2.6B
     "LFM2-2.6B": {
         "repo_id": "LiquidAI/LFM2-2.6B",
         "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
     },

-    # Models with 1.7B–2B parameters
+    # 1.7B
     "Qwen3-1.7B": {
         "repo_id": "Qwen/Qwen3-1.7B",
         "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."
     },
+
+    # ~2B (effective)
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
         "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
     },

-    # Models with 1B–1.5B parameters
+    # 1.5B
     "Nemotron-Research-Reasoning-Qwen-1.5B": {
         "repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
         "description": "Nemotron-Research-Reasoning-Qwen-1.5B"
@@ -135,24 +151,32 @@ MODELS = {
         "repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct",
         "description": "Qwen2.5-Taiwan-1.5B-Instruct"
     },
+
+    # 1.2B
     "LFM2-1.2B": {
         "repo_id": "LiquidAI/LFM2-1.2B",
         "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
     },
+
+    # 1.1B
     "Taiwan-ELM-1_1B-Instruct": {
         "repo_id": "liswei/Taiwan-ELM-1_1B-Instruct",
         "description": "Taiwan-ELM-1_1B-Instruct"
     },
+
+    # 1B
     "Llama-3.2-Taiwan-1B": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
         "description": "Llama-3.2-Taiwan base model with 1 B parameters"
     },

-    # Models with 700M–360M parameters
+    # 700M
     "LFM2-700M": {
         "repo_id": "LiquidAI/LFM2-700M",
         "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
     },
+
+    # 600M
     "Qwen3-0.6B": {
         "repo_id": "Qwen/Qwen3-0.6B",
         "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."
@@ -161,10 +185,14 @@ MODELS = {
         "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
         "description": "Qwen3-Taiwan model with 0.6 B parameters"
     },
+
+    # 500M
     "Qwen2.5-0.5B-Taiwan-Instruct": {
         "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
         "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
     },
+
+    # 360M
     "SmolLM2-360M-Instruct": {
         "repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
         "description": "Original SmolLM2‑360M Instruct"
@@ -173,12 +201,14 @@ MODELS = {
         "repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat",
         "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"
     },
+
+    # 350M
     "LFM2-350M": {
         "repo_id": "LiquidAI/LFM2-350M",
         "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
     },

-    # Models with ~270M parameters
+    # 270M
     "parser_model_ner_gemma_v0.1": {
         "repo_id": "myfi/parser_model_ner_gemma_v0.1",
         "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments."
@@ -196,7 +226,7 @@ MODELS = {
         "description": "Taiwan-ELM-270M-Instruct"
     },

-    # Models with ~135M parameters
+    # 135M
     "SmolLM2-135M-multilingual-base": {
         "repo_id": "agentlans/SmolLM2-135M-multilingual-base",
         "description": "SmolLM2-135M-multilingual-base"