Peter Larnholt committed
Commit 0d3d2c6 · 1 Parent(s): 06f264c

Initial version

Files changed (3)
  1. Dockerfile +22 -0
  2. app.py +117 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     HF_HUB_ENABLE_HF_TRANSFER=1
+
+ # System deps
+ RUN apt-get update && apt-get install -y python3 python3-pip git && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt /app/
+ RUN python3 -m pip install --upgrade pip && pip3 install -r requirements.txt
+
+ COPY app.py /app/
+
+ # HF Spaces expects the app to bind $PORT (we’ll default to 7860)
+ ENV PORT=7860
+ EXPOSE 7860
+
+ CMD ["sh", "-c", "uvicorn app:app --host 0.0.0.0 --port ${PORT:-7860}"]
app.py ADDED
@@ -0,0 +1,117 @@
+ """
+ Docker SDK app for HF Spaces (and local)
+ - Launches vLLM (OpenAI-compatible) on localhost:API_PORT
+ - FastAPI proxies /v1/* to vLLM (so clients can use OpenAI SDK / LangChain)
+ - Gradio chat UI at "/"
+ - A10G-24GB friendly defaults (Qwen 2.5 14B AWQ, 8k ctx)
+ """
+
+ import os, time, threading, subprocess, requests, json
+ from fastapi import FastAPI, Request, Response
+ from fastapi.responses import JSONResponse
+ import gradio as gr
+
+ # -------- Config (env overridable) --------
+ MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
+ API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
+ SYSTEM_PROMPT = os.environ.get(
+     "SYSTEM_PROMPT",
+     "You are ExCom AI, a professional assistant that answers precisely and clearly."
+ )
+
+ # Memory-friendly defaults for A10G (24 GB)
+ VLLM_ARGS = [
+     "python3", "-m", "vllm.entrypoints.openai.api_server",
+     "--model", MODEL_ID,
+     "--host", "0.0.0.0",
+     "--port", str(API_PORT),
+     "--served-model-name", "excom-ai",
+     "--max-model-len", "8192",
+     "--gpu-memory-utilization", "0.90",
+     "--trust-remote-code",
+ ]
+ if "AWQ" in MODEL_ID.upper():
+     # faster AWQ kernel if available
+     VLLM_ARGS += ["--quantization", "awq_marlin"]
+
+ # -------- vLLM launcher (non-blocking) --------
+ def launch_vllm():
+     print(f"[vLLM] Launching with MODEL_ID={MODEL_ID}")
+     subprocess.Popen(VLLM_ARGS)
+
+ def wait_vllm_ready(timeout=900, interval=3):
+     base = f"http://127.0.0.1:{API_PORT}/v1/models"
+     start = time.time()
+     while time.time() - start < timeout:
+         try:
+             r = requests.get(base, timeout=3)
+             if r.ok:
+                 print("[vLLM] Ready.")
+                 return True
+         except Exception:
+             pass
+         time.sleep(interval)
+     print("[vLLM] Failed to become ready in time.")
+     return False
+
+ # Start vLLM in background at process start
+ threading.Thread(target=launch_vllm, daemon=True).start()
+ threading.Thread(target=wait_vllm_ready, daemon=True).start()
+
+ # -------- FastAPI app --------
+ app = FastAPI()
+
+ @app.get("/health")
+ def health():
+     try:
+         r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=2)
+         return {"upstream_ok": r.ok}
+     except Exception as e:
+         return {"upstream_ok": False, "error": str(e)}
+
+ # Minimal proxy for OpenAI-compatible routes
+ @app.get("/v1/models")
+ def proxy_models():
+     r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=20)
+     return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
+
+ @app.post("/v1/chat/completions")
+ async def proxy_chat(request: Request):
+     body = await request.body()
+     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
+                       data=body,
+                       headers={"Content-Type": "application/json"},
+                       timeout=600)
+     return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
+
+ # -------- Gradio UI (messages mode) --------
+ _ready_flag = {"ok": False}
+ def ensure_ready():
+     if _ready_flag["ok"]:
+         return True
+     if wait_vllm_ready(timeout=60):
+         _ready_flag["ok"] = True
+         return True
+     return False
+
+ def chat_fn(user_message: str, history: list[dict]):
+     if not ensure_ready():
+         return "⏳ Model is loading… please retry in a few seconds."
+     messages = [{"role": "system", "content": SYSTEM_PROMPT}] + history + [
+         {"role": "user", "content": user_message}
+     ]
+     payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
+     r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
+                       json=payload, timeout=600)
+     r.raise_for_status()
+     return r.json()["choices"][0]["message"]["content"]
+
+ demo = gr.ChatInterface(
+     fn=chat_fn,
+     title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
+     type="messages",
+     examples=["Hello", "What can you do?", "Explain ExCom AI in one line."],
+ )
+
+ # mount Gradio at root
+ app = gr.mount_gradio_app(app, demo, path="/")
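Because app.py proxies the OpenAI-compatible routes (/v1/models and /v1/chat/completions), any OpenAI-style client can target the Space directly, as the module docstring notes. A minimal sketch using the official openai Python package (a client-side dependency, not listed in requirements.txt; the base URL below is a placeholder for the actual Space URL):

# Sketch: calling the proxied OpenAI-compatible endpoint from a client.
from openai import OpenAI  # pip install openai (client side only)

client = OpenAI(
    base_url="https://<your-space>.hf.space/v1",  # placeholder Space URL
    api_key="not-needed",  # the proxy does no auth, but the SDK requires a value
)

resp = client.chat.completions.create(
    model="excom-ai",  # matches --served-model-name passed to vLLM
    messages=[{"role": "user", "content": "Explain ExCom AI in one line."}],
    temperature=0.4,
)
print(resp.choices[0].message.content)

A LangChain chat model configured with the same base URL and model name should work the same way, since it speaks the same OpenAI wire format.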
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi>=0.111
+ uvicorn[standard]>=0.30
+ gradio>=4.38
+ requests>=2.31
+
+ # vLLM & friends
+ vllm>=0.5.2
+ transformers>=4.43
+ torch>=2.2
+ accelerate