plarnholt committed
Commit 002f0f2 (verified) · Parent: 0db58b4

Update app.py

Files changed (1)
  1. app.py +27 -47
app.py CHANGED
@@ -1,17 +1,15 @@
 """
-Docker SDK app for HF Spaces (and local)
+HF Spaces (Docker SDK) app
 - Launches vLLM (OpenAI-compatible) on localhost:API_PORT
-- FastAPI proxies /v1/* to vLLM (so clients can use OpenAI SDK / LangChain)
-- Gradio chat UI at "/"
-- A10G-24GB friendly defaults (Qwen 2.5 14B AWQ, 8k ctx)
+- FastAPI proxies /v1/* vLLM (so clients can use OpenAI SDK / LangChain)
+- Gradio UI at "/"
+- Defaults for A10G 24GB (Qwen 2.5 14B AWQ, 8k context)
 """
 
-import os, time, threading, subprocess, requests, json
+import os, time, threading, subprocess, requests
 from fastapi import FastAPI, Request, Response
-from fastapi.responses import JSONResponse
 import gradio as gr
 
-# -------- Config (env overridable) --------
 MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
 API_PORT = int(os.environ.get("API_PORT", "8000")) # vLLM internal port
 SYSTEM_PROMPT = os.environ.get(
@@ -19,46 +17,41 @@ SYSTEM_PROMPT = os.environ.get(
     "You are ExCom AI, a professional assistant that answers precisely and clearly."
 )
 
-# Memory-friendly defaults for A10G (24 GB)
 VLLM_ARGS = [
     "python3", "-m", "vllm.entrypoints.openai.api_server",
     "--model", MODEL_ID,
     "--host", "0.0.0.0",
     "--port", str(API_PORT),
     "--served-model-name", "excom-ai",
-    "--max-model-len", "8192",
+    "--max-model-len", "8192", # fits A10G 24GB
     "--gpu-memory-utilization", "0.90",
     "--trust-remote-code",
 ]
 if "AWQ" in MODEL_ID.upper():
-    # faster AWQ kernel if available
-    VLLM_ARGS += ["--quantization", "awq_marlin"]
+    VLLM_ARGS += ["--quantization", "awq_marlin"] # faster AWQ kernel if available
 
-# -------- vLLM launcher (non-blocking) --------
 def launch_vllm():
-    print(f"[vLLM] Launching with MODEL_ID={MODEL_ID}")
+    print(f"[vLLM] Launch: {MODEL_ID}")
    subprocess.Popen(VLLM_ARGS)
 
 def wait_vllm_ready(timeout=900, interval=3):
-    base = f"http://127.0.0.1:{API_PORT}/v1/models"
+    url = f"http://127.0.0.1:{API_PORT}/v1/models"
     start = time.time()
     while time.time() - start < timeout:
         try:
-            r = requests.get(base, timeout=3)
+            r = requests.get(url, timeout=3)
             if r.ok:
                 print("[vLLM] Ready.")
                 return True
         except Exception:
             pass
         time.sleep(interval)
-    print("[vLLM] Failed to become ready in time.")
+    print("[vLLM] Not ready in time.")
     return False
 
-# Start vLLM in background at process start
 threading.Thread(target=launch_vllm, daemon=True).start()
 threading.Thread(target=wait_vllm_ready, daemon=True).start()
 
-# -------- FastAPI app --------
 app = FastAPI()
 
 @app.get("/health")
@@ -69,49 +62,36 @@ def health():
     except Exception as e:
         return {"upstream_ok": False, "error": str(e)}
 
-# Minimal proxy for OpenAI-compatible routes
 @app.get("/v1/models")
 def proxy_models():
-    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=20)
-    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
+    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=30)
+    return Response(content=r.content, media_type=r.headers.get("content-type","application/json"), status_code=r.status_code)
 
 @app.post("/v1/chat/completions")
-async def proxy_chat(request: Request):
-    body = await request.body()
+async def proxy_chat(req: Request):
+    body = await req.body()
     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
                       data=body,
                       headers={"Content-Type": "application/json"},
                       timeout=600)
-    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
+    return Response(content=r.content, media_type=r.headers.get("content-type","application/json"), status_code=r.status_code)
 
-# -------- Gradio UI (messages mode) --------
-_ready_flag = {"ok": False}
+# -------- Gradio (messages mode) --------
+_ready = {"ok": False}
 def ensure_ready():
-    if _ready_flag["ok"]:
-        return True
-    if wait_vllm_ready(timeout=60):
-        _ready_flag["ok"] = True
-        return True
+    if _ready["ok"]: return True
+    if wait_vllm_ready(timeout=60): _ready["ok"] = True; return True
     return False
 
 def chat_fn(user_message: str, history: list[dict]):
     if not ensure_ready():
-        return "⏳ Model is loading… please retry in a few seconds."
-    messages = [{"role": "system", "content": SYSTEM_PROMPT}] + history + [
-        {"role": "user", "content": user_message}
-    ]
-    payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
-    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
-                      json=payload, timeout=600)
+        return "⏳ Model is loading… please retry shortly."
+    messages = [{"role":"system","content":SYSTEM_PROMPT}] + history + [{"role":"user","content":user_message}]
+    payload = {"model":"excom-ai","messages":messages,"temperature":0.4}
+    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions", json=payload, timeout=600)
     r.raise_for_status()
     return r.json()["choices"][0]["message"]["content"]
 
-demo = gr.ChatInterface(
-    fn=chat_fn,
-    title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
-    type="messages",
-    examples=["Hello", "What can you do?", "Explain ExCom AI in one line."],
-)
-
-# mount Gradio at root
-app = gr.mount_gradio_app(app, demo, path="/")
+ui = gr.ChatInterface(fn=chat_fn, title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)", type="messages")
+ui.queue()
+app = gr.mount_gradio_app(app, ui, path="/")
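
The file's docstring says the /v1/* proxy exists so clients can use the OpenAI SDK or LangChain directly. A minimal client sketch against that proxy, assuming a placeholder base URL for the Space (or localhost) and a dummy API key, since the proxy adds no authentication of its own; the model name "excom-ai" comes from --served-model-name in VLLM_ARGS:

    # Client-side sketch: call the proxied /v1/chat/completions with the OpenAI Python SDK.
    from openai import OpenAI

    client = OpenAI(
        base_url="https://<your-space>.hf.space/v1",  # placeholder URL (assumption)
        api_key="not-needed",  # no auth on the proxy; the SDK only requires a non-empty key
    )

    resp = client.chat.completions.create(
        model="excom-ai",  # matches --served-model-name above
        messages=[{"role": "user", "content": "Explain ExCom AI in one line."}],
        temperature=0.4,
    )
    print(resp.choices[0].message.content)

LangChain's ChatOpenAI can be pointed at the same base_url in the same way.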
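Because wait_vllm_ready allows up to 900 seconds for the model to load, /v1 requests on a cold Space will fail until the upstream is up. A small readiness sketch that polls the app's /health route first, assuming the success branch of health() reports an upstream_ok key mirroring the error branch shown in the diff (the base URL is again a placeholder):

    # Readiness sketch: poll /health until the vLLM upstream answers, then list served models.
    import time
    import requests

    BASE = "https://<your-space>.hf.space"  # placeholder (assumption)

    for _ in range(60):  # ~5 minutes at 5 s per attempt
        try:
            if requests.get(f"{BASE}/health", timeout=5).json().get("upstream_ok"):
                break
        except Exception:
            pass
        time.sleep(5)

    print(requests.get(f"{BASE}/v1/models", timeout=30).json())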