Avijit Ghosh committed on
Commit 00714e7 · 1 Parent(s): db1c946

Fix vLLM setup: Use ungated models and add device detection


- Replace gated meta-llama/Llama-3.1-8B-Instruct with Meta-Llama-3-8B-Instruct (ungated)
- Replace Qwen3-8B with Qwen2.5-7B-Instruct (better INF2 support)
- Replace openai/gpt-oss-20b with mistralai/Mistral-7B-Instruct-v0.3
- Add VLLM_LOGGING_LEVEL=DEBUG environment variable for device detection
- Add --trust-remote-code flag to vLLM command
- Fix missing subprocess.Popen code in vllm-manager.py
- All models are ungated and have confirmed Neuron support
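For reference, the launch command assembled by the updated vllm-manager.py (see the diff below) is roughly equivalent to the following; this is a sketch reconstructed from the new `cmd` list and the `VLLM_LOGGING_LEVEL` setting, not a command that appears verbatim in the repo:

```bash
# Approximate vLLM invocation produced by vllm-manager.py after this commit
VLLM_LOGGING_LEVEL=DEBUG python3 -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --host 0.0.0.0 \
  --port 8001 \
  --device neuron \
  --tensor-parallel-size 2 \
  --download-dir /data/models \
  --trust-remote-code
```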

Files changed (3)
  1. VLLM_SETUP.md +11 -6
  2. entrypoint.sh +2 -2
  3. vllm-manager.py +57 -41
VLLM_SETUP.md CHANGED
@@ -8,19 +8,24 @@ This branch uses vLLM with AWS Neuron support for running models on Amazon INF2
 
 ### Available Models
 
-All three models are pre-configured and cached in persistent storage:
+All three models are pre-configured and cached in persistent storage (no gated access required):
 
-| Ollama Model | HuggingFace Model | Notes |
+| Display Name | HuggingFace Model | Notes |
 |--------------|-------------------|-------|
-| `llama3.1:8b` | `meta-llama/Llama-3.1-8B-Instruct` | Default, best balance |
-| `qwen3:8b` | `Qwen/Qwen3-8B` | Fast, multilingual |
-| `gpt-oss:20b` | `openai/gpt-oss-20b` | Larger, more capable |
+| `Llama 3 8B Instruct` | `meta-llama/Meta-Llama-3-8B-Instruct` | Default, best balance, ungated |
+| `Qwen 2.5 7B Instruct` | `Qwen/Qwen2.5-7B-Instruct` | Fast, multilingual, great for coding |
+| `Mistral 7B Instruct v0.3` | `mistralai/Mistral-7B-Instruct-v0.3` | Efficient, good reasoning |
+
+**Note**: These models were chosen specifically because they:
+- Don't require gated access or license agreements
+- Have confirmed vLLM + AWS Neuron support
+- Work well on INF2 hardware
 
 ### Environment Variables
 
 ```bash
 # Default model to load at startup
-VLLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
+VLLM_MODEL=meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 You can change the default startup model, but all three models will be available in the UI regardless.
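A minimal sketch of overriding the startup model, assuming the variable is exported in the container's environment before entrypoint.sh runs (the direct invocation below is illustrative; in practice the container runs the entrypoint on startup):

```bash
# Illustrative override: start with Qwen 2.5 instead of the Llama 3 default
export VLLM_MODEL="Qwen/Qwen2.5-7B-Instruct"
./entrypoint.sh  # hypothetical manual invocation for testing
```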
entrypoint.sh CHANGED
@@ -21,8 +21,8 @@ echo "Starting vLLM Model Manager"
 mkdir -p /data/models
 
 # Default model for vLLM (can be overridden via VLLM_MODEL env var)
-# Available models: meta-llama/Llama-3.1-8B-Instruct, Qwen/Qwen3-8B, openai/gpt-oss-20b
-export VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+# Available models: meta-llama/Meta-Llama-3-8B-Instruct, Qwen/Qwen2.5-7B-Instruct, mistralai/Mistral-7B-Instruct-v0.3
+export VLLM_MODEL=${VLLM_MODEL:-"meta-llama/Meta-Llama-3-8B-Instruct"}
 
 # Make manager executable
 chmod +x /app/vllm-manager.py
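The `${VLLM_MODEL:-...}` expansion above falls back to the Llama 3 default only when the variable is unset or empty; a small illustration (not part of the repo):

```bash
# With no value set, the default is used
unset VLLM_MODEL
echo "${VLLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct}"  # prints the Llama 3 default

# With a value exported, that value wins
export VLLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
echo "${VLLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct}"  # prints the Mistral model
```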
vllm-manager.py CHANGED
@@ -14,65 +14,81 @@ import requests
 
 # Model configurations
 MODELS = {
-    "meta-llama/Llama-3.1-8B-Instruct": {
-        "id": "meta-llama/Llama-3.1-8B-Instruct",
-        "displayName": "Llama 3.1 8B",
-        "description": "Meta's Llama 3.1 8B Instruct model"
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "hf_model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "display_name": "Llama 3 8B Instruct",
+        "description": "Meta's Llama 3 8B instruction-tuned model (ungated)"
     },
-    "Qwen/Qwen3-8B": {
-        "id": "Qwen/Qwen3-8B",
-        "displayName": "Qwen 3 8B",
-        "description": "Alibaba's Qwen 3 8B model"
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "hf_model": "Qwen/Qwen2.5-7B-Instruct",
+        "display_name": "Qwen 2.5 7B Instruct",
+        "description": "Alibaba's Qwen 2.5 7B instruction-tuned model"
     },
-    "openai/gpt-oss-20b": {
-        "id": "openai/gpt-oss-20b",
-        "displayName": "GPT OSS 20B",
-        "description": "OpenAI's GPT OSS 20B model"
+    "mistralai/Mistral-7B-Instruct-v0.3": {
+        "hf_model": "mistralai/Mistral-7B-Instruct-v0.3",
+        "display_name": "Mistral 7B Instruct v0.3",
+        "description": "Mistral AI's 7B instruction-tuned model"
     }
 }
 
 # Current state
-current_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
+current_model = os.environ.get("VLLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
 vllm_process = None
 cache_dir = "/data/models"
 
-def start_vllm(model_id):
-    """Start vLLM server with the specified model"""
-    global vllm_process
+def start_vllm(model_name):
+    """Start vLLM with the specified model"""
+    global vllm_process, current_model
+
+    if model_name not in MODELS:
+        print(f"Error: Unknown model {model_name}")
+        return False
+
+    model_info = MODELS[model_name]
+    hf_model = model_info["hf_model"]
 
-    print(f"Starting vLLM with model: {model_id}")
+    print(f"\n{'='*60}")
+    print(f"Starting vLLM with model: {model_info['display_name']}")
+    print(f"HuggingFace model: {hf_model}")
+    print(f"{'='*60}\n")
+
+    # Set environment variable for device detection
+    env = os.environ.copy()
+    env["VLLM_LOGGING_LEVEL"] = "DEBUG"
 
     cmd = [
         "python3", "-m", "vllm.entrypoints.openai.api_server",
-        "--model", model_id,
+        "--model", hf_model,
         "--host", "0.0.0.0",
-        "--port", "8001",  # Use 8001 for actual vLLM
+        "--port", "8001",
         "--device", "neuron",
         "--tensor-parallel-size", "2",
-        "--download-dir", cache_dir
+        "--download-dir", "/data/models",
+        "--trust-remote-code"
     ]
 
-    vllm_process = subprocess.Popen(
-        cmd,
-        stdout=open("/tmp/vllm.log", "a"),
-        stderr=subprocess.STDOUT
-    )
-
-    # Wait for vLLM to be ready
-    for i in range(120):  # 10 minutes timeout
-        try:
-            resp = requests.get("http://localhost:8001/health", timeout=1)
-            if resp.status_code == 200:
-                print(f"vLLM ready with model: {model_id}")
-                return True
-        except:
-            pass
-        time.sleep(5)
-        if i % 6 == 0:
-            print(f"Waiting for vLLM... ({i*5}s)")
-
-    print("ERROR: vLLM failed to start")
-    return False
+    try:
+        vllm_process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
+            bufsize=1,
+            env=env
+        )
+
+        # Monitor output in a background thread
+        def monitor_output():
+            for line in vllm_process.stdout:
+                print(f"[vLLM] {line.rstrip()}")
+
+        Thread(target=monitor_output, daemon=True).start()
+
+        current_model = model_name
+        return True
+    except Exception as e:
+        print(f"Error starting vLLM: {e}")
+        return False
 
 def stop_vllm():
     """Stop the current vLLM process"""