Luigi committed · verified
Commit 15b78c7 · 1 Parent(s): 4addcb2

Update app.py

Files changed (1)
  1. app.py +17 -17
app.py CHANGED
@@ -27,24 +27,24 @@ cancel_event = threading.Event()
 # ------------------------------
 MODELS = {
     # ~30.5B total parameters (MoE: 3.3B activated)
-    "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit": {
-        "repo_id": "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit",
-        "description": "4-bit AWQ quantized instruct-tuned MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision."
-    },
-    "Qwen3-30B-A3B-Thinking-2507-AWQ-4bit": {
-        "repo_id": "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
-        "description": "4-bit AWQ quantized thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
-    },
+    # "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit": {
+    #     "repo_id": "cpatonn/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit",
+    #     "description": "4-bit AWQ quantized instruct-tuned MoE model based on Qwen3-30B-A3B-Instruct-2507. Features 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and native 262,144-token context. Excels in instruction following, logical reasoning, multilingualism, coding, and long-context understanding. Supports only non-thinking mode (no <think> blocks). Quantized using AWQ (W4A16) with lm_head and gating layers preserved in higher precision."
+    # },
+    # "Qwen3-30B-A3B-Thinking-2507-AWQ-4bit": {
+    #     "repo_id": "cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
+    #     "description": "4-bit AWQ quantized thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
+    # },
 
-    # ~80B total parameters (MoE: 3B activated)
-    "Qwen3-Next-80B-A3B-Instruct-AWQ-4bit": {
-        "repo_id": "cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit",
-        "description": "4-bit AWQ quantized instruct model from the Qwen3-Next series. Features 80B total parameters (3B activated), hybrid Gated DeltaNet + Gated Attention architecture, 512 experts (10 activated + 1 shared), and native 262,144-token context (extendable to 1M tokens with YaRN). Delivers performance comparable to Qwen3-235B on many benchmarks while offering superior ultra-long-context efficiency. Supports only non-thinking mode. Note: May require re-quantization for stable inference (as of Sept 2025)."
-    },
-    "Qwen3-Next-80B-A3B-Thinking-AWQ-4bit": {
-        "repo_id": "cpatonn/Qwen3-Next-80B-A3B-Thinking-AWQ-4bit",
-        "description": "4-bit AWQ quantized thinking-mode variant of Qwen3-Next-80B-A3B. Combines 80B total parameters (3B activated), hybrid attention (Gated DeltaNet + Gated Attention), and 512-expert MoE (10 activated + 1 shared) for advanced reasoning over ultra-long contexts (natively 262K, extendable to 1M tokens). Designed for complex problem-solving with automatic reasoning trace generation. Quantized using AWQ; intended for high-end agentic and analytical workloads."
-    },
+    # # ~80B total parameters (MoE: 3B activated)
+    # "Qwen3-Next-80B-A3B-Instruct-AWQ-4bit": {
+    #     "repo_id": "cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit",
+    #     "description": "4-bit AWQ quantized instruct model from the Qwen3-Next series. Features 80B total parameters (3B activated), hybrid Gated DeltaNet + Gated Attention architecture, 512 experts (10 activated + 1 shared), and native 262,144-token context (extendable to 1M tokens with YaRN). Delivers performance comparable to Qwen3-235B on many benchmarks while offering superior ultra-long-context efficiency. Supports only non-thinking mode. Note: May require re-quantization for stable inference (as of Sept 2025)."
+    # },
+    # "Qwen3-Next-80B-A3B-Thinking-AWQ-4bit": {
+    #     "repo_id": "cpatonn/Qwen3-Next-80B-A3B-Thinking-AWQ-4bit",
+    #     "description": "4-bit AWQ quantized thinking-mode variant of Qwen3-Next-80B-A3B. Combines 80B total parameters (3B activated), hybrid attention (Gated DeltaNet + Gated Attention), and 512-expert MoE (10 activated + 1 shared) for advanced reasoning over ultra-long contexts (natively 262K, extendable to 1M tokens). Designed for complex problem-solving with automatic reasoning trace generation. Quantized using AWQ; intended for high-end agentic and analytical workloads."
+    # },
 
     # ~235B total parameters (MoE: 22B activated) — included for reference if added later
     # "Qwen3-235B-A22B-Thinking": { ... },