from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import uvicorn

app = FastAPI()

# --- Model configuration ---
# Make sure a GGUF model has been downloaded from Hugging Face first, e.g.:
# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
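# A minimal download sketch, assuming the huggingface_hub CLI is installed and
# that a Q4_K_M quantization is published in that repo (check the repo's file
# list; the exact file name below is an assumption):
#   huggingface-cli download Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF \
#       qwen2.5-coder-0.5b-instruct-q4_k_m.gguf --local-dir .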
MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to match your local file

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,    # context window size in tokens
    n_threads=4,   # adjust to the number of CPU cores
    n_batch=512    # prompt-processing batch size
)

# --- Request schema ---
class ChatRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256

# --- Chat endpoint ---
@app.post("/chat")
def chat(req: ChatRequest):
    output = llm(
        req.prompt,
        max_tokens=req.max_new_tokens,
        stop=["</s>", "User:", "Assistant:"],  # cut generation at turn markers
        echo=False                             # return only the completion, not the prompt
    )
    response = output["choices"][0]["text"].strip()
    return {"response": response}

# --- Root endpoint ---
@app.get("/")
def root():
    return {"message": "Qwen GGUF FastAPI running 🚀"}

if __name__ == "__main__":
    # Hugging Face Spaces expects apps to listen on port 7860; honor PORT if set.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port)
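
# Run with `python app.py` (or `uvicorn app:app --host 0.0.0.0 --port 7860`).
# FastAPI also serves interactive API docs at http://localhost:7860/docs.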