Add qwen 80b-a3b
app.py CHANGED
@@ -27,6 +27,18 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
+    "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": {
+        "repo_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
+        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include a native 262,144-token context (extendable toward 1M tokens via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization for efficiency. Optimized for fast, stable instruction-following dialogue without 'thinking' traces, making it ideal for general chat and low-latency applications."
+    },
+    "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": {
+        "repo_id": "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
+        "description": "Sparse Mixture-of-Experts (MoE) causal language model with 80B total parameters and approximately 3B activated per inference step. Features include a native 262,144-token context (extendable toward 1M tokens via YaRN), 16 query heads and 2 KV heads, head dimension of 256, and FP8 quantization. Specialized for complex reasoning, math, and coding tasks, this model outputs structured 'thinking' traces by default and is designed to be used with a reasoning parser."
+    },
+    "Qwen3-32B-FP8": {
+        "repo_id": "Qwen/Qwen3-32B-FP8",
+        "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities."
+    },
     # ~30.5B total parameters (MoE: 3.3B activated)
     "Qwen3-30B-A3B-Instruct-2507": {
         "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
@@ -36,10 +48,6 @@ MODELS = {
         "repo_id": "Qwen/Qwen3-30B-A3B-Thinking-2507",
         "description": "thinking-mode MoE model based on Qwen3-30B-A3B-Thinking-2507. Contains 30.5B total parameters (3.3B activated), 128 experts (8 activated), 48 layers, and 262,144-token native context. Optimized for deep reasoning in mathematics, science, coding, and agent tasks. Outputs include automatic reasoning delimiters (<think>...</think>). Quantized with AWQ (W4A16), preserving lm_head and expert gating layers."
     },
-    "Qwen3-32B-FP8": {
-        "repo_id": "Qwen/Qwen3-32B-FP8",
-        "description": "Dense causal language model with 32.8B total parameters (31.2B non-embedding), 64 layers, 64 query heads & 8 KV heads, native 32,768-token context (extendable to 131,072 via YaRN). Features seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (for efficient dialogue), strong multilingual support (100+ languages), and leading open-source agent capabilities."
-    },
     "gpt-oss-20b-BF16": {
         "repo_id": "unsloth/gpt-oss-20b-BF16",
         "description": "A 20B-parameter open-source GPT-style language model quantized to INT4 using AutoRound, with FP8 key-value cache for efficient inference. Optimized for performance and memory efficiency on Intel hardware while maintaining strong language generation capabilities."
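The diff only touches the MODELS registry, so the loading path is not shown here. Below is a minimal sketch of how one of the new FP8 entries could be consumed, assuming app.py resolves repo_id through the standard transformers API (a release recent enough to include the Qwen3-Next architecture) rather than a serving backend such as vLLM or SGLang; the load_model helper is illustrative and not taken from app.py.

# Illustrative only: resolve a registry entry to its repo_id and load it with
# the standard transformers API. app.py's real loader may differ, and the FP8
# Qwen3-Next checkpoints may instead require a dedicated serving stack.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODELS = {  # trimmed copy of the registry added in this diff
    "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": {
        "repo_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
    },
}

def load_model(name: str):
    repo_id = MODELS[name]["repo_id"]
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        torch_dtype="auto",   # keep the checkpoint's stored dtypes where supported
        device_map="auto",    # shard across available accelerators (needs accelerate)
    )
    return tokenizer, model

# Usage (downloads a large checkpoint, so shown here only as an example call):
# tokenizer, model = load_model("Qwen/Qwen3-Next-80B-A3B-Instruct-FP8")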
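Several of these descriptions mention extending the context window via YaRN. A hedged sketch of what that could look like for the dense Qwen3-32B-FP8 entry, following the rope_scaling pattern the Qwen3 model cards document; exact keys and behavior can vary across transformers versions, so treat this as illustrative rather than the Space's actual configuration.

# Illustrative: enable YaRN rope scaling to stretch Qwen3-32B-FP8 from its
# native 32,768-token context toward 131,072 tokens.
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("Qwen/Qwen3-32B-FP8")
config.rope_scaling = {
    "rope_type": "yarn",
    "factor": 4.0,                               # 32,768 * 4 = 131,072 tokens
    "original_max_position_embeddings": 32768,
}
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-32B-FP8",
    config=config,
    torch_dtype="auto",
    device_map="auto",
)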
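The thinking variants are described as emitting <think>...</think> reasoning delimiters and as intended for use with a reasoning parser downstream. A minimal sketch of that post-processing step, assuming those delimiters; split_thinking is a hypothetical helper, not something defined in app.py.

import re

# Hypothetical helper: split a thinking-mode completion into (reasoning, answer).
# Assumes the model wraps its trace in <think>...</think> as described above;
# some outputs may omit the opening tag, so a bare closing tag is also handled.
THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)

def split_thinking(text: str) -> tuple[str, str]:
    match = THINK_RE.search(text)
    if match:
        reasoning = match.group(1).strip()
        answer = (text[:match.start()] + text[match.end():]).strip()
        return reasoning, answer
    # Fallback: everything before a lone </think> is treated as reasoning.
    if "</think>" in text:
        reasoning, _, answer = text.partition("</think>")
        return reasoning.strip(), answer.strip()
    return "", text.strip()

# Example:
# split_thinking("<think>2 + 2 = 4</think>The answer is 4.")
# -> ("2 + 2 = 4", "The answer is 4.")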