Peter Larnholt committed
Commit 0d3d2c6 · 1 Parent(s): 06f264c

Initial version

Files changed (3)
  1. Dockerfile +22 -0
  2. app.py +117 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     HF_HUB_ENABLE_HF_TRANSFER=1
+
+ # System deps
+ RUN apt-get update && apt-get install -y python3 python3-pip git && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt /app/
+ RUN python3 -m pip install --upgrade pip && pip3 install -r requirements.txt
+
+ COPY app.py /app/
+
+ # HF Spaces expects the app to bind $PORT (we’ll default to 7860)
+ ENV PORT=7860
+ EXPOSE 7860
+
+ CMD ["sh", "-c", "uvicorn app:app --host 0.0.0.0 --port ${PORT:-7860}"]
app.py ADDED
@@ -0,0 +1,117 @@
+ """
+ Docker SDK app for HF Spaces (and local)
+ - Launches vLLM (OpenAI-compatible) on localhost:API_PORT
+ - FastAPI proxies /v1/* to vLLM (so clients can use OpenAI SDK / LangChain)
+ - Gradio chat UI at "/"
+ - A10G-24GB friendly defaults (Qwen 2.5 14B AWQ, 8k ctx)
+ """
+
+ import os, time, threading, subprocess, requests, json
+ from fastapi import FastAPI, Request, Response
+ from fastapi.responses import JSONResponse
+ import gradio as gr
+
+ # -------- Config (env overridable) --------
+ MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
+ API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
+ SYSTEM_PROMPT = os.environ.get(
+     "SYSTEM_PROMPT",
+     "You are ExCom AI, a professional assistant that answers precisely and clearly."
+ )
+
+ # Memory-friendly defaults for A10G (24 GB)
+ VLLM_ARGS = [
+     "python3", "-m", "vllm.entrypoints.openai.api_server",
+     "--model", MODEL_ID,
+     "--host", "0.0.0.0",
+     "--port", str(API_PORT),
+     "--served-model-name", "excom-ai",
+     "--max-model-len", "8192",
+     "--gpu-memory-utilization", "0.90",
+     "--trust-remote-code",
+ ]
+ if "AWQ" in MODEL_ID.upper():
+     # faster AWQ kernel if available
+     VLLM_ARGS += ["--quantization", "awq_marlin"]
+
+ # -------- vLLM launcher (non-blocking) --------
+ def launch_vllm():
+     print(f"[vLLM] Launching with MODEL_ID={MODEL_ID}")
+     subprocess.Popen(VLLM_ARGS)
+
+ def wait_vllm_ready(timeout=900, interval=3):
+     base = f"http://127.0.0.1:{API_PORT}/v1/models"
+     start = time.time()
+     while time.time() - start < timeout:
+         try:
+             r = requests.get(base, timeout=3)
+             if r.ok:
+                 print("[vLLM] Ready.")
+                 return True
+         except Exception:
+             pass
+         time.sleep(interval)
+     print("[vLLM] Failed to become ready in time.")
+     return False
+
+ # Start vLLM in background at process start
+ threading.Thread(target=launch_vllm, daemon=True).start()
+ threading.Thread(target=wait_vllm_ready, daemon=True).start()
+
+ # -------- FastAPI app --------
+ app = FastAPI()
+
+ @app.get("/health")
+ def health():
+     try:
+         r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=2)
+         return {"upstream_ok": r.ok}
+     except Exception as e:
+         return {"upstream_ok": False, "error": str(e)}
+
+ # Minimal proxy for OpenAI-compatible routes
+ @app.get("/v1/models")
+ def proxy_models():
+     r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=20)
+     return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
+
+ @app.post("/v1/chat/completions")
+ async def proxy_chat(request: Request):
+     body = await request.body()
+     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
+                       data=body,
+                       headers={"Content-Type": "application/json"},
+                       timeout=600)
+     return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
+
+ # -------- Gradio UI (messages mode) --------
+ _ready_flag = {"ok": False}
+ def ensure_ready():
+     if _ready_flag["ok"]:
+         return True
+     if wait_vllm_ready(timeout=60):
+         _ready_flag["ok"] = True
+         return True
+     return False
+
+ def chat_fn(user_message: str, history: list[dict]):
+     if not ensure_ready():
+         return "⏳ Model is loading… please retry in a few seconds."
+     messages = [{"role": "system", "content": SYSTEM_PROMPT}] + history + [
+         {"role": "user", "content": user_message}
+     ]
+     payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
+     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
+                       json=payload, timeout=600)
+     r.raise_for_status()
+     return r.json()["choices"][0]["message"]["content"]
+
+ demo = gr.ChatInterface(
+     fn=chat_fn,
+     title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
+     type="messages",
+     examples=["Hello", "What can you do?", "Explain ExCom AI in one line."],
+ )
+
+ # mount Gradio at root
+ app = gr.mount_gradio_app(app, demo, path="/")
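Because app.py proxies the OpenAI-compatible routes (/v1/models and /v1/chat/completions), any OpenAI-style client can target the Space directly, as the module docstring notes. A minimal sketch using the official openai Python package (a client-side dependency, not listed in requirements.txt; the base URL below is a placeholder for the actual Space URL):

# Sketch: calling the proxied OpenAI-compatible endpoint from a client.
from openai import OpenAI  # pip install openai (client side only)

client = OpenAI(
    base_url="https://<your-space>.hf.space/v1",  # placeholder Space URL
    api_key="not-needed",  # the proxy does no auth, but the SDK requires a value
)

resp = client.chat.completions.create(
    model="excom-ai",  # matches --served-model-name passed to vLLM
    messages=[{"role": "user", "content": "Explain ExCom AI in one line."}],
    temperature=0.4,
)
print(resp.choices[0].message.content)

A LangChain chat model configured with the same base URL and model name should work the same way, since it speaks the same OpenAI wire format.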
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi>=0.111
+ uvicorn[standard]>=0.30
+ gradio>=4.38
+ requests>=2.31
+
+ # vLLM & friends
+ vllm>=0.5.2
+ transformers>=4.43
+ torch>=2.2
+ accelerate