"""
HF Spaces (Docker SDK) app
- Launches vLLM (OpenAI-compatible) on localhost:API_PORT
- FastAPI proxies /v1/* → vLLM (so clients can use OpenAI SDK / LangChain)
- Gradio UI at "/"
- Defaults for A10G 24GB (Qwen 2.5 14B AWQ, 8k context)
"""
import os, time, threading, subprocess, requests
from fastapi import FastAPI, Request, Response
import gradio as gr
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
API_PORT = int(os.environ.get("API_PORT", "8000")) # vLLM internal port
SYSTEM_PROMPT = os.environ.get(
    "SYSTEM_PROMPT",
    "You are ExCom AI, a professional assistant that answers precisely and clearly."
)
VLLM_ARGS = [
    "python3", "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_ID,
    "--host", "0.0.0.0",
    "--port", str(API_PORT),
    "--served-model-name", "excom-ai",
    "--max-model-len", "8192",            # fits A10G 24GB
    "--gpu-memory-utilization", "0.90",
    "--trust-remote-code",
    "--enable-auto-tool-choice",          # enable OpenAI-style tool calling (example request sketched below)
    "--tool-call-parser", "hermes",       # use the Hermes parser format for Qwen
]
if "AWQ" in MODEL_ID.upper():
VLLM_ARGS += ["--quantization", "awq_marlin"] # faster AWQ kernel if available
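# Example tool-calling request body (a hedged sketch only; the "get_weather" tool is
# illustrative and not part of this app). With --enable-auto-tool-choice, vLLM decides
# when to emit OpenAI-style tool calls for payloads like:
#
#   {
#     "model": "excom-ai",
#     "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
#     "tools": [{
#       "type": "function",
#       "function": {
#         "name": "get_weather",
#         "description": "Get current weather for a city",
#         "parameters": {
#           "type": "object",
#           "properties": {"city": {"type": "string"}},
#           "required": ["city"]
#         }
#       }
#     }]
#   }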
def launch_vllm():
    print(f"[vLLM] Launch: {MODEL_ID}")
    # Merge stderr into stdout so crashes/errors during startup or generation show up in the logs
    subprocess.Popen(VLLM_ARGS, stderr=subprocess.STDOUT)
def wait_vllm_ready(timeout=900, interval=3):
    url = f"http://127.0.0.1:{API_PORT}/v1/models"
    start = time.time()
    while time.time() - start < timeout:
        try:
            r = requests.get(url, timeout=3)
            if r.ok:
                print("[vLLM] Ready.")
                return True
        except Exception:
            pass
        time.sleep(interval)
    print("[vLLM] Not ready in time.")
    return False
threading.Thread(target=launch_vllm, daemon=True).start()
threading.Thread(target=wait_vllm_ready, daemon=True).start()
app = FastAPI()
@app.get("/health")
def health():
    try:
        r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=2)
        return {"upstream_ok": r.ok}
    except Exception as e:
        return {"upstream_ok": False, "error": str(e)}
@app.get("/v1/models")
def proxy_models():
    r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=30)
    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
@app.post("/v1/chat/completions")
async def proxy_chat(req: Request):
    body = await req.body()
    r = requests.post(
        f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
        timeout=600,
    )
    return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"), status_code=r.status_code)
# -------- Gradio (messages mode) --------
_ready = {"ok": False}
def ensure_ready():
    if _ready["ok"]:
        return True
    if wait_vllm_ready(timeout=60):
        _ready["ok"] = True
        return True
    return False
def chat_fn(user_message: str, history: list[dict]):
    if not ensure_ready():
        return "⏳ Model is loading… please retry shortly."
    # Strip Gradio-specific fields (metadata, options) for OpenAI compatibility
    clean_history = [{"role": m["role"], "content": m["content"]} for m in history]
    messages = [{"role": "system", "content": SYSTEM_PROMPT}] + clean_history + [{"role": "user", "content": user_message}]
    payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
    r = requests.post(f"http://127.0.0.1:{API_PORT}/v1/chat/completions", json=payload, timeout=600)
    if not r.ok:
        print(f"[ERROR] vLLM returned {r.status_code}: {r.text}")
        return f"❌ Error: {r.status_code} - Check logs for details"
    return r.json()["choices"][0]["message"]["content"]
ui = gr.ChatInterface(fn=chat_fn, title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)", type="messages")
ui.queue()
app = gr.mount_gradio_app(app, ui, path="/")