Avijit Ghosh committed
Commit: 00714e7
Parent: db1c946
Fix vLLM setup: Use ungated models and add device detection
- Replace gated meta-llama/Llama-3.1-8B-Instruct with Meta-Llama-3-8B-Instruct (ungated)
- Replace Qwen3-8B with Qwen2.5-7B-Instruct (better INF2 support)
- Replace openai/gpt-oss-20b with mistralai/Mistral-7B-Instruct-v0.3
- Add VLLM_LOGGING_LEVEL=DEBUG environment variable for device detection
- Add --trust-remote-code flag to vLLM command
- Fix missing subprocess.Popen code in vllm-manager.py
- All models are ungated and have confirmed Neuron support
- VLLM_SETUP.md +11 -6
- entrypoint.sh +2 -2
- vllm-manager.py +57 -41
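The logging and trust flags called out in the commit message both land in the command that vllm-manager.py spawns (see its diff further down). Roughly, the manual equivalent of that launch is the sketch below; the values mirror this commit rather than general vLLM defaults.

```bash
# Approximate manual equivalent of the launch performed by vllm-manager.py in this commit
export VLLM_LOGGING_LEVEL=DEBUG   # verbose logs to aid Neuron device detection
python3 -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --host 0.0.0.0 \
  --port 8001 \
  --device neuron \
  --tensor-parallel-size 2 \
  --download-dir /data/models \
  --trust-remote-code
```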
VLLM_SETUP.md
CHANGED
````diff
@@ -8,19 +8,24 @@ This branch uses vLLM with AWS Neuron support for running models on Amazon INF2
 
 ### Available Models
 
-All three models are pre-configured and cached in persistent storage:
+All three models are pre-configured and cached in persistent storage (no gated access required):
 
-| …
+| Display Name | HuggingFace Model | Notes |
 |--------------|-------------------|-------|
-| `…
-| `…
-| `…
+| `Llama 3 8B Instruct` | `meta-llama/Meta-Llama-3-8B-Instruct` | Default, best balance, ungated |
+| `Qwen 2.5 7B Instruct` | `Qwen/Qwen2.5-7B-Instruct` | Fast, multilingual, great for coding |
+| `Mistral 7B Instruct v0.3` | `mistralai/Mistral-7B-Instruct-v0.3` | Efficient, good reasoning |
+
+**Note**: These models were chosen specifically because they:
+- Don't require gated access or license agreements
+- Have confirmed vLLM + AWS Neuron support
+- Work well on INF2 hardware
 
 ### Environment Variables
 
 ```bash
 # Default model to load at startup
-VLLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
+VLLM_MODEL=meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 You can change the default startup model, but all three models will be available in the UI regardless.
````
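Because entrypoint.sh only applies the Meta-Llama-3 default when `VLLM_MODEL` is unset, changing the startup model is just a matter of providing the variable before the entrypoint runs. A minimal sketch, assuming a local run of `entrypoint.sh` (on a Hugging Face Space the same value would go in the Space's environment variables instead):

```bash
# Hypothetical local invocation: start with Qwen as the default instead of Llama 3
VLLM_MODEL=Qwen/Qwen2.5-7B-Instruct ./entrypoint.sh
```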
entrypoint.sh
CHANGED
```diff
@@ -21,8 +21,8 @@ echo "Starting vLLM Model Manager"
 mkdir -p /data/models
 
 # Default model for vLLM (can be overridden via VLLM_MODEL env var)
-# Available models: meta-llama/Llama-3.1-8B-Instruct, …
-export VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+# Available models: meta-llama/Meta-Llama-3-8B-Instruct, Qwen/Qwen2.5-7B-Instruct, mistralai/Mistral-7B-Instruct-v0.3
+export VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Meta-Llama-3-8B-Instruct"}
 
 # Make manager executable
 chmod +x /app/vllm-manager.py
```
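The `${VLLM_MODEL:-"..."}` form is standard shell parameter expansion: the quoted default is used only when the variable is unset or empty, so any externally supplied value wins. For illustration:

```bash
unset VLLM_MODEL
echo "${VLLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct}"   # prints the default

export VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.3
echo "${VLLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct}"   # prints the override
```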
vllm-manager.py
CHANGED
```diff
@@ -14,65 +14,81 @@ import requests
 
 # Model configurations
 MODELS = {
-    "meta-llama/Llama-3.1-8B-Instruct": {
-        "…
-        "…
-        "description": "Meta's Llama 3 …
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "hf_model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "display_name": "Llama 3 8B Instruct",
+        "description": "Meta's Llama 3 8B instruction-tuned model (ungated)"
     },
-    "Qwen/Qwen3-8B": {
-        "…
-        "…
-        "description": "Alibaba's Qwen …
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "hf_model": "Qwen/Qwen2.5-7B-Instruct",
+        "display_name": "Qwen 2.5 7B Instruct",
+        "description": "Alibaba's Qwen 2.5 7B instruction-tuned model"
     },
-    "openai/gpt-oss-20b": {
-        "…
-        "…
-        "description": "…
+    "mistralai/Mistral-7B-Instruct-v0.3": {
+        "hf_model": "mistralai/Mistral-7B-Instruct-v0.3",
+        "display_name": "Mistral 7B Instruct v0.3",
+        "description": "Mistral AI's 7B instruction-tuned model"
     }
 }
 
 # Current state
-current_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
+current_model = os.environ.get("VLLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
 vllm_process = None
 cache_dir = "/data/models"
 
-def start_vllm(…
-    """Start vLLM …
-    global vllm_process
+def start_vllm(model_name):
+    """Start vLLM with the specified model"""
+    global vllm_process, current_model
+
+    if model_name not in MODELS:
+        print(f"Error: Unknown model {model_name}")
+        return False
+
+    model_info = MODELS[model_name]
+    hf_model = model_info["hf_model"]
 
-    print(f"…
+    print(f"\n{'='*60}")
+    print(f"Starting vLLM with model: {model_info['display_name']}")
+    print(f"HuggingFace model: {hf_model}")
+    print(f"{'='*60}\n")
+
+    # Set environment variable for device detection
+    env = os.environ.copy()
+    env["VLLM_LOGGING_LEVEL"] = "DEBUG"
 
     cmd = [
         "python3", "-m", "vllm.entrypoints.openai.api_server",
-        "--model", …
+        "--model", hf_model,
         "--host", "0.0.0.0",
         "--port", "8001",
         "--device", "neuron",
         "--tensor-parallel-size", "2",
-        "--download-dir", …
+        "--download-dir", "/data/models",
+        "--trust-remote-code"
     ]
 
-    …
+    try:
+        vllm_process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
+            bufsize=1,
+            env=env
+        )
+
+        # Monitor output in background thread
+        def monitor_output():
+            for line in vllm_process.stdout:
+                print(f"[vLLM] {line.rstrip()}")
+
+        Thread(target=monitor_output, daemon=True).start()
+
+        current_model = model_name
+        return True
+    except Exception as e:
+        print(f"Error starting vLLM: {e}")
+        return False
 
 def stop_vllm():
     """Stop the current vLLM process"""
```