"""
Docker SDK app for HF Spaces (and local)
- Launches vLLM (OpenAI-compatible) on localhost:API_PORT
- FastAPI proxies /v1/* to vLLM (so clients can use OpenAI SDK / LangChain)
- Gradio chat UI at "/"
- A10G-24GB friendly defaults (Qwen 2.5 14B AWQ, 8k ctx)
"""

import os, time, threading, subprocess, requests
from fastapi import FastAPI, Request, Response
import gradio as gr

# -------- Config (env overridable) --------
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
SYSTEM_PROMPT = os.environ.get(
    "SYSTEM_PROMPT",
    "You are ExCom AI, a professional assistant that answers precisely and clearly."
)
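
# Example environment overrides (hypothetical values; set them as Space
# variables or via `docker run -e`):
#   MODEL_ID=Qwen/Qwen2.5-7B-Instruct-AWQ
#   SYSTEM_PROMPT="You are ExCom AI, a concise technical assistant."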

# Memory-friendly defaults for A10G (24 GB)
VLLM_ARGS = [
    "python3", "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_ID,
    "--host", "0.0.0.0",
    "--port", str(API_PORT),
    "--served-model-name", "excom-ai",
    "--max-model-len", "8192",
    "--gpu-memory-utilization", "0.90",
    "--trust-remote-code",
]
if "AWQ" in MODEL_ID.upper():
    # faster AWQ kernel if available
    VLLM_ARGS += ["--quantization", "awq_marlin"]

# -------- vLLM launcher (non-blocking) --------
def launch_vllm():
    print(f"[vLLM] Launching with MODEL_ID={MODEL_ID}")
    subprocess.Popen(VLLM_ARGS)

def wait_vllm_ready(timeout=900, interval=3):
    base = f"http://127.0.0.1:{API_PORT}/v1/models"
    start = time.time()
    while time.time() - start < timeout:
        try:
            r = requests.get(base, timeout=3)
            if r.ok:
                print("[vLLM] Ready.")
                return True
        except Exception:
            pass
        time.sleep(interval)
    print("[vLLM] Failed to become ready in time.")
    return False

# Start vLLM in background at process start
threading.Thread(target=launch_vllm, daemon=True).start()
threading.Thread(target=wait_vllm_ready, daemon=True).start()

# -------- FastAPI app --------
app = FastAPI()

@app.get("/health")
def health():
    try:
        r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=2)
        return {"upstream_ok": r.ok}
    except Exception as e:
        return {"upstream_ok": False, "error": str(e)}

# Minimal proxy for OpenAI-compatible routes
@app.get("/v1/models")
def proxy_models():
    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=20)
    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)

@app.post("/v1/chat/completions")
async def proxy_chat(request: Request):
    body = await request.body()
    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
                      data=body,
                      headers={"Content-Type": "application/json"},
                      timeout=600)
    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
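
# Note: requests.post above buffers the entire upstream body, so requests that
# set "stream": true are not relayed incrementally. A minimal streaming variant
# (a sketch, not wired in here) could relay chunks with StreamingResponse:
#
#   from fastapi.responses import StreamingResponse
#   upstream = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
#                            data=body, stream=True, timeout=600,
#                            headers={"Content-Type": "application/json"})
#   return StreamingResponse(upstream.iter_content(chunk_size=None),
#                            media_type=upstream.headers.get("content-type"))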

# -------- Gradio UI (messages mode) --------
_ready_flag = {"ok": False}
def ensure_ready():
    if _ready_flag["ok"]:
        return True
    if wait_vllm_ready(timeout=60):
        _ready_flag["ok"] = True
        return True
    return False

def chat_fn(user_message: str, history: list[dict]):
    if not ensure_ready():
        return "⏳ Model is loading… please retry in a few seconds."
    # Keep only role/content from prior turns; Gradio may attach extra keys
    # (e.g. metadata) that some OpenAI-compatible servers reject.
    past = [{"role": m["role"], "content": m["content"]} for m in history]
    messages = [{"role": "system", "content": SYSTEM_PROMPT}] + past + [
        {"role": "user", "content": user_message}
    ]
    payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
                      json=payload, timeout=600)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

demo = gr.ChatInterface(
    fn=chat_fn,
    title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
    type="messages",
    examples=["Hello", "What can you do?", "Explain ExCom AI in one line."],
)

# mount Gradio at root
app = gr.mount_gradio_app(app, demo, path="/")
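
# Local run (assuming this file is saved as app.py; on HF Spaces the Dockerfile
# entrypoint starts the server, typically on port 7860):
#   uvicorn app:app --host 0.0.0.0 --port 7860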