from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from transformers import AutoTokenizer, AutoModelForCausalLM import torch app = FastAPI() # Allow frontend requests app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Load model model_name = "openai/gpt-oss-20b" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, device_map="auto" ) class ChatRequest(BaseModel): message: str @app.post("/chat") async def chat(req: ChatRequest): messages = [{"role": "user", "content": req.message}] inputs = tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=100) reply = tokenizer.decode( outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True ) return {"reply": reply}