Spaces:

plarnholt
/

excom-ai-demo

Paused

App Files Files Community

plarnholt committed on Oct 9

Commit

002f0f2

verified ·

1 Parent(s): 0db58b4

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -47

app.py CHANGED Viewed

@@ -1,17 +1,15 @@
 """
-Docker SDK app for HF Spaces (and local)
 - Launches vLLM (OpenAI-compatible) on localhost:API_PORT
-- FastAPI proxies /v1/* to vLLM (so clients can use OpenAI SDK / LangChain)
-- Gradio chat UI at "/"
-- A10G-24GB friendly defaults (Qwen 2.5 14B AWQ, 8k ctx)
 """
-import os, time, threading, subprocess, requests, json
 from fastapi import FastAPI, Request, Response
-from fastapi.responses import JSONResponse
 import gradio as gr
-# -------- Config (env overridable) --------
 MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
 API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
 SYSTEM_PROMPT = os.environ.get(
@@ -19,46 +17,41 @@ SYSTEM_PROMPT = os.environ.get(
     "You are ExCom AI, a professional assistant that answers precisely and clearly."
 )
-# Memory-friendly defaults for A10G (24 GB)
 VLLM_ARGS = [
     "python3", "-m", "vllm.entrypoints.openai.api_server",
     "--model", MODEL_ID,
     "--host", "0.0.0.0",
     "--port", str(API_PORT),
     "--served-model-name", "excom-ai",
-    "--max-model-len", "8192",
     "--gpu-memory-utilization", "0.90",
     "--trust-remote-code",
 ]
 if "AWQ" in MODEL_ID.upper():
-    # faster AWQ kernel if available
-    VLLM_ARGS += ["--quantization", "awq_marlin"]
-# -------- vLLM launcher (non-blocking) --------
 def launch_vllm():
-    print(f"[vLLM] Launching with MODEL_ID={MODEL_ID}")
     subprocess.Popen(VLLM_ARGS)
 def wait_vllm_ready(timeout=900, interval=3):
-    base = f"http://127.0.0.1:{API_PORT}/v1/models"
     start = time.time()
     while time.time() - start < timeout:
         try:
-            r = requests.get(base, timeout=3)
             if r.ok:
                 print("[vLLM] Ready.")
                 return True
         except Exception:
             pass
         time.sleep(interval)
-    print("[vLLM] Failed to become ready in time.")
     return False
-# Start vLLM in background at process start
 threading.Thread(target=launch_vllm, daemon=True).start()
 threading.Thread(target=wait_vllm_ready, daemon=True).start()
-# -------- FastAPI app --------
 app = FastAPI()
 @app.get("/health")
@@ -69,49 +62,36 @@ def health():
     except Exception as e:
         return {"upstream_ok": False, "error": str(e)}
-# Minimal proxy for OpenAI-compatible routes
 @app.get("/v1/models")
 def proxy_models():
-    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=20)
-    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
 @app.post("/v1/chat/completions")
-async def proxy_chat(request: Request):
-    body = await request.body()
     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
                       data=body,
                       headers={"Content-Type": "application/json"},
                       timeout=600)
-    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
-# -------- Gradio UI (messages mode) --------
-_ready_flag = {"ok": False}
 def ensure_ready():
-    if _ready_flag["ok"]:
-        return True
-    if wait_vllm_ready(timeout=60):
-        _ready_flag["ok"] = True
-        return True
     return False
 def chat_fn(user_message: str, history: list[dict]):
     if not ensure_ready():
-        return "⏳ Model is loading… please retry in a few seconds."
-    messages = [{"role": "system", "content": SYSTEM_PROMPT}] + history + [
-        {"role": "user", "content": user_message}
-    ]
-    payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
-    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
-                      json=payload, timeout=600)
     r.raise_for_status()
     return r.json()["choices"][0]["message"]["content"]
-demo = gr.ChatInterface(
-    fn=chat_fn,
-    title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
-    type="messages",
-    examples=["Hello", "What can you do?", "Explain ExCom AI in one line."],
-)
-# mount Gradio at root
-app = gr.mount_gradio_app(app, demo, path="/")

 """
+HF Spaces (Docker SDK) app
 - Launches vLLM (OpenAI-compatible) on localhost:API_PORT
+- FastAPI proxies /v1/* → vLLM (so clients can use OpenAI SDK / LangChain)
+- Gradio UI at "/"
+- Defaults for A10G 24GB (Qwen 2.5 14B AWQ, 8k context)
 """
+import os, time, threading, subprocess, requests
 from fastapi import FastAPI, Request, Response
 import gradio as gr
 MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
 API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
 SYSTEM_PROMPT = os.environ.get(
     "You are ExCom AI, a professional assistant that answers precisely and clearly."
 )
 VLLM_ARGS = [
     "python3", "-m", "vllm.entrypoints.openai.api_server",
     "--model", MODEL_ID,
     "--host", "0.0.0.0",
     "--port", str(API_PORT),
     "--served-model-name", "excom-ai",
+    "--max-model-len", "8192",               # fits A10G 24GB
     "--gpu-memory-utilization", "0.90",
     "--trust-remote-code",
 ]
 if "AWQ" in MODEL_ID.upper():
+    VLLM_ARGS += ["--quantization", "awq_marlin"]  # faster AWQ kernel if available
 def launch_vllm():
+    print(f"[vLLM] Launch: {MODEL_ID}")
     subprocess.Popen(VLLM_ARGS)
 def wait_vllm_ready(timeout=900, interval=3):
+    url = f"http://127.0.0.1:{API_PORT}/v1/models"
     start = time.time()
     while time.time() - start < timeout:
         try:
+            r = requests.get(url, timeout=3)
             if r.ok:
                 print("[vLLM] Ready.")
                 return True
         except Exception:
             pass
         time.sleep(interval)
+    print("[vLLM] Not ready in time.")
     return False
 threading.Thread(target=launch_vllm, daemon=True).start()
 threading.Thread(target=wait_vllm_ready, daemon=True).start()
 app = FastAPI()
 @app.get("/health")
     except Exception as e:
         return {"upstream_ok": False, "error": str(e)}
 @app.get("/v1/models")
 def proxy_models():
+    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=30)
+    return Response(content=r.content, media_type=r.headers.get("content-type","application/json"), status_code=r.status_code)
 @app.post("/v1/chat/completions")
+async def proxy_chat(req: Request):
+    body = await req.body()
     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
                       data=body,
                       headers={"Content-Type": "application/json"},
                       timeout=600)
+    return Response(content=r.content, media_type=r.headers.get("content-type","application/json"), status_code=r.status_code)
+# -------- Gradio (messages mode) --------
+_ready = {"ok": False}
 def ensure_ready():
+    if _ready["ok"]: return True
+    if wait_vllm_ready(timeout=60): _ready["ok"] = True; return True
     return False
 def chat_fn(user_message: str, history: list[dict]):
     if not ensure_ready():
+        return "⏳ Model is loading… please retry shortly."
+    messages = [{"role":"system","content":SYSTEM_PROMPT}] + history + [{"role":"user","content":user_message}]
+    payload = {"model":"excom-ai","messages":messages,"temperature":0.4}
+    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions", json=payload, timeout=600)
     r.raise_for_status()
     return r.json()["choices"][0]["message"]["content"]
+ui = gr.ChatInterface(fn=chat_fn, title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)", type="messages")
+ui.queue()
+app = gr.mount_gradio_app(app, ui, path="/")