Luigi committed (verified)
Commit e5a1663 · 1 Parent(s): de64679

Update app.py

Files changed (1): app.py +56 -26
app.py CHANGED
@@ -26,11 +26,20 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
-    # Models with 14B+ parameters
-    # "Apriel-1.5-15b-Thinker": {
-    #     "repo_id": "ServiceNow-AI/Apriel-1.5-15b-Thinker",
-    #     "description": "A 15B multimodal reasoning model from ServiceNow’s Apriel series. Achieves SOTA performance on text and image reasoning (52 on Artificial Analysis index, 68 on Tau2 Bench Telecom, 62 on IFBench) despite undergoing only text SFT—no image fine-tuning. Fits on a single GPU and competes with models 10× its size like Deepseek R1 and Gemini-Flash."
-    # },
+    # ~30.5B total parameters (MoE: 3.3B activated)
+    "Qwen3-30B-A3B-Thinking-2507-FP8": {
+        "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8",
+        "description": "FP8-quantized MoE model with 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Optimized for complex reasoning tasks with enhanced thinking capabilities in mathematics, coding, science, and agent benchmarks. Supports only thinking mode; includes automatic reasoning delimiters."
+    },
+    "Qwen3-30B-A3B-Instruct-2507-FP8": {
+        "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8",
+        "description": "FP8-quantized instruct-tuned variant of Qwen3-30B-A3B (30.5B total params, 3.3B activated), featuring strong general capabilities in instruction following, tool usage, text generation, and 256K long-context understanding. Ideal for agentic and multi-turn dialogue applications."
+    },
+
+    # ~235B total parameters (MoE: 22B activated) — included for reference if added later
+    # "Qwen3-235B-A22B-Thinking": { ... },
+
+    # 14.8B total parameters
     "Qwen3-14B": {
         "repo_id": "Qwen/Qwen3-14B",
         "description": "Dense causal language model with 14.8 B total parameters (13.2 B non-embedding), 40 layers, 40 query heads & 8 KV heads, 32 768-token context (131 072 via YaRN), enhanced human preference alignment & advanced agent integration."
@@ -40,17 +49,34 @@ MODELS = {
         "description": "FP8-quantized version of Qwen3-14B for efficient inference."
     },

-    # Models with ~5B parameters
+    # ~15B (commented out in original, but larger than 14B)
+    # "Apriel-1.5-15b-Thinker": { ... },
+
+    # 5B
     "Apriel-5B-Instruct": {
         "repo_id": "ServiceNow-AI/Apriel-5B-Instruct",
         "description": "A 5B-parameter instruction-tuned model from ServiceNow’s Apriel series, optimized for enterprise tasks and general-purpose instruction following."
     },

-    # Models with 4B–4.3B parameters
+    # 4.3B
+    "Phi-4-mini-Reasoning": {
+        "repo_id": "microsoft/Phi-4-mini-reasoning",
+        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
+    },
+    "Phi-4-mini-Instruct": {
+        "repo_id": "microsoft/Phi-4-mini-instruct",
+        "description": "Phi-4-mini-Instruct (4.3B parameters)"
+    },
+
+    # 4.0B
     "Qwen3-4B": {
         "repo_id": "Qwen/Qwen3-4B",
         "description": "Dense causal language model with 4.0 B total parameters (3.6 B non-embedding), 36 layers, 32 query heads & 8 KV heads, native 32 768-token context (extendable to 131 072 via YaRN), balanced mid-range capacity & long-context reasoning."
     },
+    "Qwen3-4B-Instruct-2507": {
+        "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
+        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
+    },
     "Gemma-3-4B-IT": {
         "repo_id": "unsloth/gemma-3-4b-it",
         "description": "Gemma-3-4B-IT"
@@ -63,24 +89,12 @@ MODELS = {
         "repo_id": "google/gemma-3n-E4B",
         "description": "Gemma 3n base model with effective 4 B parameters (≈3 GB VRAM)"
     },
-    "Phi-4-mini-Reasoning": {
-        "repo_id": "microsoft/Phi-4-mini-reasoning",
-        "description": "Phi-4-mini-Reasoning (4.3B parameters)"
-    },
-    "Phi-4-mini-Instruct": {
-        "repo_id": "microsoft/Phi-4-mini-instruct",
-        "description": "Phi-4-mini-Instruct (4.3B parameters)"
-    },
     "SmallThinker-4BA0.6B-Instruct": {
         "repo_id": "PowerInfer/SmallThinker-4BA0.6B-Instruct",
         "description": "SmallThinker 4 B backbone with 0.6 B activated parameters, instruction‑tuned"
     },
-    "Qwen3-4B-Instruct-2507": {
-        "repo_id": "Qwen/Qwen3-4B-Instruct-2507",
-        "description": "Updated non-thinking instruct variant of Qwen3-4B with 4.0B parameters, featuring significant improvements in instruction following, logical reasoning, multilingualism, and 256K long-context understanding. Strong performance across knowledge, coding, alignment, and agent benchmarks."
-    },

-    # Models with ~3B parameters
+    # ~3B
     "AI21-Jamba-Reasoning-3B": {
         "repo_id": "ai21labs/AI21-Jamba-Reasoning-3B",
         "description": "A compact 3B hybrid Transformer–Mamba reasoning model with 256K context length, strong intelligence benchmark scores (61% MMLU-Pro, 52% IFBench), and efficient inference suitable for edge and datacenter use. Outperforms Gemma-3 4B and Llama-3.2 3B despite smaller size."
@@ -106,23 +120,25 @@ MODELS = {
         "description": "A 3B-parameter long-context instruct model from IBM, finetuned for enhanced instruction following and tool-calling. Supports 12 languages including English, Chinese, Arabic, and Japanese. Built on a dense Transformer with GQA, RoPE, SwiGLU, and 128K context length. Trained using SFT, RL alignment, and model merging techniques for enterprise applications."
     },

-    # Models with 2.6B parameters
+    # 2.6B
     "LFM2-2.6B": {
         "repo_id": "LiquidAI/LFM2-2.6B",
         "description": "The 2.6B parameter model in the LFM2 series, it outperforms models in the 3B+ class and features a hybrid architecture for faster inference."
     },

-    # Models with 1.7B–2B parameters
+    # 1.7B
     "Qwen3-1.7B": {
         "repo_id": "Qwen/Qwen3-1.7B",
         "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages."
     },
+
+    # ~2B (effective)
     "Gemma-3n-E2B": {
         "repo_id": "google/gemma-3n-E2B",
         "description": "Gemma 3n base model with effective 2 B parameters (≈2 GB VRAM)"
     },

-    # Models with 1B–1.5B parameters
+    # 1.5B
     "Nemotron-Research-Reasoning-Qwen-1.5B": {
         "repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
         "description": "Nemotron-Research-Reasoning-Qwen-1.5B"
@@ -135,24 +151,32 @@ MODELS = {
         "repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct",
         "description": "Qwen2.5-Taiwan-1.5B-Instruct"
     },
+
+    # 1.2B
     "LFM2-1.2B": {
         "repo_id": "LiquidAI/LFM2-1.2B",
         "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks."
     },
+
+    # 1.1B
     "Taiwan-ELM-1_1B-Instruct": {
         "repo_id": "liswei/Taiwan-ELM-1_1B-Instruct",
         "description": "Taiwan-ELM-1_1B-Instruct"
     },
+
+    # 1B
     "Llama-3.2-Taiwan-1B": {
         "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
         "description": "Llama-3.2-Taiwan base model with 1 B parameters"
     },

-    # Models with 700M–360M parameters
+    # 700M
     "LFM2-700M": {
         "repo_id": "LiquidAI/LFM2-700M",
         "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions."
     },
+
+    # 600M
     "Qwen3-0.6B": {
         "repo_id": "Qwen/Qwen3-0.6B",
         "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities."
@@ -161,10 +185,14 @@ MODELS = {
         "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
         "description": "Qwen3-Taiwan model with 0.6 B parameters"
     },
+
+    # 500M
     "Qwen2.5-0.5B-Taiwan-Instruct": {
         "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
         "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned"
     },
+
+    # 360M
     "SmolLM2-360M-Instruct": {
         "repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
         "description": "Original SmolLM2‑360M Instruct"
@@ -173,12 +201,14 @@ MODELS = {
         "repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat",
         "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat"
     },
+
+    # 350M
     "LFM2-350M": {
         "repo_id": "LiquidAI/LFM2-350M",
         "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3."
     },

-    # Models with ~270M parameters
+    # 270M
     "parser_model_ner_gemma_v0.1": {
         "repo_id": "myfi/parser_model_ner_gemma_v0.1",
         "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments."
@@ -196,7 +226,7 @@ MODELS = {
         "description": "Taiwan-ELM-270M-Instruct"
     },

-    # Models with ~135M parameters
+    # 135M
     "SmolLM2-135M-multilingual-base": {
         "repo_id": "agentlans/SmolLM2-135M-multilingual-base",
         "description": "SmolLM2-135M-multilingual-base"