Avijit Ghosh committed on
Commit 00714e7 · 1 Parent(s): db1c946

Fix vLLM setup: Use ungated models and add device detection


- Replace gated meta-llama/Llama-3.1-8B-Instruct with Meta-Llama-3-8B-Instruct (ungated)
- Replace Qwen3-8B with Qwen2.5-7B-Instruct (better INF2 support)
- Replace openai/gpt-oss-20b with mistralai/Mistral-7B-Instruct-v0.3
- Add VLLM_LOGGING_LEVEL=DEBUG environment variable for device detection
- Add --trust-remote-code flag to vLLM command
- Fix missing subprocess.Popen code in vllm-manager.py
- All models are ungated and have confirmed Neuron support
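For reference, the launch command assembled by the updated vllm-manager.py (see the diff below) is roughly equivalent to the following; this is a sketch reconstructed from the new `cmd` list and the `VLLM_LOGGING_LEVEL` setting, not a command that appears verbatim in the repo:

```bash
# Approximate vLLM invocation produced by vllm-manager.py after this commit
VLLM_LOGGING_LEVEL=DEBUG python3 -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --host 0.0.0.0 \
  --port 8001 \
  --device neuron \
  --tensor-parallel-size 2 \
  --download-dir /data/models \
  --trust-remote-code
```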

Files changed (3)
  1. VLLM_SETUP.md +11 -6
  2. entrypoint.sh +2 -2
  3. vllm-manager.py +57 -41
VLLM_SETUP.md CHANGED
@@ -8,19 +8,24 @@ This branch uses vLLM with AWS Neuron support for running models on Amazon INF2
 
 ### Available Models
 
-All three models are pre-configured and cached in persistent storage:
+All three models are pre-configured and cached in persistent storage (no gated access required):
 
-| Ollama Model | HuggingFace Model | Notes |
+| Display Name | HuggingFace Model | Notes |
 |--------------|-------------------|-------|
-| `llama3.1:8b` | `meta-llama/Llama-3.1-8B-Instruct` | Default, best balance |
-| `qwen3:8b` | `Qwen/Qwen3-8B` | Fast, multilingual |
-| `gpt-oss:20b` | `openai/gpt-oss-20b` | Larger, more capable |
+| `Llama 3 8B Instruct` | `meta-llama/Meta-Llama-3-8B-Instruct` | Default, best balance, ungated |
+| `Qwen 2.5 7B Instruct` | `Qwen/Qwen2.5-7B-Instruct` | Fast, multilingual, great for coding |
+| `Mistral 7B Instruct v0.3` | `mistralai/Mistral-7B-Instruct-v0.3` | Efficient, good reasoning |
+
+**Note**: These models were chosen specifically because they:
+- Don't require gated access or license agreements
+- Have confirmed vLLM + AWS Neuron support
+- Work well on INF2 hardware
 
 ### Environment Variables
 
 ```bash
 # Default model to load at startup
-VLLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
+VLLM_MODEL=meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 You can change the default startup model, but all three models will be available in the UI regardless.
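A minimal sketch of overriding the startup model, assuming the variable is exported in the container's environment before entrypoint.sh runs (the direct invocation below is illustrative; in practice the container runs the entrypoint on startup):

```bash
# Illustrative override: start with Qwen 2.5 instead of the Llama 3 default
export VLLM_MODEL="Qwen/Qwen2.5-7B-Instruct"
./entrypoint.sh  # hypothetical manual invocation for testing
```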
entrypoint.sh CHANGED
@@ -21,8 +21,8 @@ echo "Starting vLLM Model Manager"
 mkdir -p /data/models
 
 # Default model for vLLM (can be overridden via VLLM_MODEL env var)
-# Available models: meta-llama/Llama-3.1-8B-Instruct, Qwen/Qwen3-8B, openai/gpt-oss-20b
-export VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+# Available models: meta-llama/Meta-Llama-3-8B-Instruct, Qwen/Qwen2.5-7B-Instruct, mistralai/Mistral-7B-Instruct-v0.3
+export VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Meta-Llama-3-8B-Instruct"}
 
 # Make manager executable
 chmod +x /app/vllm-manager.py
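The `${VLLM_MODEL:-...}` expansion above falls back to the Llama 3 default only when the variable is unset or empty; a small illustration (not part of the repo):

```bash
# With no value set, the default is used
unset VLLM_MODEL
echo "${VLLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct}"  # prints the Llama 3 default

# With a value exported, that value wins
export VLLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
echo "${VLLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct}"  # prints the Mistral model
```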
vllm-manager.py CHANGED
@@ -14,65 +14,81 @@ import requests
 
 # Model configurations
 MODELS = {
-    "meta-llama/Llama-3.1-8B-Instruct": {
-        "id": "meta-llama/Llama-3.1-8B-Instruct",
-        "displayName": "Llama 3.1 8B",
-        "description": "Meta's Llama 3.1 8B Instruct model"
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "hf_model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "display_name": "Llama 3 8B Instruct",
+        "description": "Meta's Llama 3 8B instruction-tuned model (ungated)"
     },
-    "Qwen/Qwen3-8B": {
-        "id": "Qwen/Qwen3-8B",
-        "displayName": "Qwen 3 8B",
-        "description": "Alibaba's Qwen 3 8B model"
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "hf_model": "Qwen/Qwen2.5-7B-Instruct",
+        "display_name": "Qwen 2.5 7B Instruct",
+        "description": "Alibaba's Qwen 2.5 7B instruction-tuned model"
     },
-    "openai/gpt-oss-20b": {
-        "id": "openai/gpt-oss-20b",
-        "displayName": "GPT OSS 20B",
-        "description": "OpenAI's GPT OSS 20B model"
+    "mistralai/Mistral-7B-Instruct-v0.3": {
+        "hf_model": "mistralai/Mistral-7B-Instruct-v0.3",
+        "display_name": "Mistral 7B Instruct v0.3",
+        "description": "Mistral AI's 7B instruction-tuned model"
     }
 }
 
 # Current state
-current_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
+current_model = os.environ.get("VLLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
 vllm_process = None
 cache_dir = "/data/models"
 
-def start_vllm(model_id):
-    """Start vLLM server with the specified model"""
-    global vllm_process
+def start_vllm(model_name):
+    """Start vLLM with the specified model"""
+    global vllm_process, current_model
+
+    if model_name not in MODELS:
+        print(f"Error: Unknown model {model_name}")
+        return False
+
+    model_info = MODELS[model_name]
+    hf_model = model_info["hf_model"]
 
-    print(f"Starting vLLM with model: {model_id}")
+    print(f"\n{'='*60}")
+    print(f"Starting vLLM with model: {model_info['display_name']}")
+    print(f"HuggingFace model: {hf_model}")
+    print(f"{'='*60}\n")
+
+    # Set environment variable for device detection
+    env = os.environ.copy()
+    env["VLLM_LOGGING_LEVEL"] = "DEBUG"
 
     cmd = [
         "python3", "-m", "vllm.entrypoints.openai.api_server",
-        "--model", model_id,
+        "--model", hf_model,
         "--host", "0.0.0.0",
-        "--port", "8001",  # Use 8001 for actual vLLM
+        "--port", "8001",
         "--device", "neuron",
         "--tensor-parallel-size", "2",
-        "--download-dir", cache_dir
+        "--download-dir", "/data/models",
+        "--trust-remote-code"
     ]
 
-    vllm_process = subprocess.Popen(
-        cmd,
-        stdout=open("/tmp/vllm.log", "a"),
-        stderr=subprocess.STDOUT
-    )
-
-    # Wait for vLLM to be ready
-    for i in range(120):  # 10 minutes timeout
-        try:
-            resp = requests.get("http://localhost:8001/health", timeout=1)
-            if resp.status_code == 200:
-                print(f"vLLM ready with model: {model_id}")
-                return True
-        except:
-            pass
-        time.sleep(5)
-        if i % 6 == 0:
-            print(f"Waiting for vLLM... ({i*5}s)")
-
-    print("ERROR: vLLM failed to start")
-    return False
+    try:
+        vllm_process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
+            bufsize=1,
+            env=env
+        )
+
+        # Monitor output in a background thread
+        def monitor_output():
+            for line in vllm_process.stdout:
+                print(f"[vLLM] {line.rstrip()}")
+
+        Thread(target=monitor_output, daemon=True).start()
+
+        current_model = model_name
+        return True
+    except Exception as e:
+        print(f"Error starting vLLM: {e}")
+        return False
 
 def stop_vllm():
     """Stop the current vLLM process"""