from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the model: from_pretrained downloads the GGUF weights from the
# Hugging Face Hub (this requires the huggingface_hub package) and caches them locally
llm = Llama.from_pretrained(
    repo_id="unsloth/phi-4-GGUF",
    filename="phi-4-Q4_K_M.gguf",
)

# Define the request model
class ChatRequest(BaseModel):
    system_prompt: str
    query: str
# Register the route; without this decorator FastAPI never exposes the endpoint
@app.post("/chat")
def chat(request: ChatRequest):
    # create_chat_completion is synchronous, so a plain def lets FastAPI
    # run it in a worker thread instead of blocking the event loop
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": request.system_prompt},
            {"role": "user", "content": request.query},
        ]
    )
    # The full completion dict is returned; the generated text is at
    # response["choices"][0]["message"]["content"]
    return {"response": response}
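With the route registered, the service can be started with uvicorn (e.g. `uvicorn main:app`, assuming the code above is saved as main.py) and called from any HTTP client. Below is a minimal client sketch using the requests library; the /chat path matches the route above, and the address 127.0.0.1:8000 is uvicorn's default.

import requests

# Assumes the server was started with: uvicorn main:app
# (uvicorn serves on http://127.0.0.1:8000 by default)
resp = requests.post(
    "http://127.0.0.1:8000/chat",
    json={
        "system_prompt": "You are a concise assistant.",
        "query": "What is a GGUF file?",
    },
)
# The endpoint wraps the full completion dict under the "response" key
completion = resp.json()["response"]
print(completion["choices"][0]["message"]["content"])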