from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import uvicorn

app = FastAPI()

# --- Model configuration ---
# Make sure you have downloaded a GGUF model from Hugging Face, e.g.:
# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to your local file

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,      # context window in tokens
    n_threads=4,     # adjust to the number of CPU cores
    n_batch=512      # batch size
)

# --- Request schema ---
class ChatRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256

# --- Chat endpoint ---
@app.post("/chat")
def chat(req: ChatRequest):
    output = llm(
        req.prompt,
        max_tokens=req.max_new_tokens,
        # Stop sequences; adjust to the model's chat template
        # (e.g. "<|im_end|>" for Qwen instruct models).
        stop=["</s>", "User:", "Assistant:"],
        echo=False
    )
    response = output["choices"][0]["text"].strip()
    return {"response": response}

# --- Root endpoint ---
@app.get("/")
def root():
    return {"message": "Qwen GGUF FastAPI running 🚀"}

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port)
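
# --- Example client call (illustrative sketch, not part of the server file) ---
# A minimal sketch of how the /chat endpoint could be exercised once the server
# above is running. It assumes the default port 7860 and the ChatRequest fields
# defined earlier; the prompt text is just a placeholder.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/chat",
#       json={
#           "prompt": "Write a Python function that reverses a string.",
#           "max_new_tokens": 128,
#       },
#   )
#   print(resp.json()["response"])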