from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import uvicorn

app = FastAPI()

# --- Model configuration ---
# Make sure you have downloaded a GGUF model from Hugging Face, e.g.:
# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to your local file

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,      # context window in tokens
    n_threads=4,     # adjust to the number of CPU cores
    n_batch=512      # batch size
)

# --- Request schema ---
class ChatRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256

# --- Chat endpoint ---
@app.post("/chat")
def chat(req: ChatRequest):
    output = llm(
        req.prompt,
        max_tokens=req.max_new_tokens,
        # Stop sequences; adjust to the model's chat template
        # (e.g. "<|im_end|>" for Qwen instruct models).
        stop=["</s>", "User:", "Assistant:"],
        echo=False
    )
    response = output["choices"][0]["text"].strip()
    return {"response": response}

# --- Root endpoint ---
@app.get("/")
def root():
    return {"message": "Qwen GGUF FastAPI running 🚀"}

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port)
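
# --- Example client call (illustrative sketch, not part of the server file) ---
# A minimal sketch of how the /chat endpoint could be exercised once the server
# above is running. It assumes the default port 7860 and the ChatRequest fields
# defined earlier; the prompt text is just a placeholder.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/chat",
#       json={
#           "prompt": "Write a Python function that reverses a string.",
#           "max_new_tokens": 128,
#       },
#   )
#   print(resp.json()["response"])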