import time

from fastapi import FastAPI
from pydantic import BaseModel

from utils import generate_response, tokenizer
from error_handler import handle_generation_error
from config import MODEL_NAME, API_V1_PORT

app = FastAPI(
    title="Phi-3-mini Blazing-Fast API (v1)",
    description="Built on microsoft/Phi-3-mini-4k-instruct-ONNX, optimized for CPU environments",
    version="1.0.0",
)


class ChatCompletionRequest(BaseModel):
    model: str = MODEL_NAME
    messages: list[dict]
    temperature: float = 0.1


@app.post("/v1/chat/completions", summary="Chat completion endpoint (OpenAI-compatible)")
async def chat_completion(request: ChatCompletionRequest):
    # Initialized up front so the except branch can reference it safely even
    # when the failure happens before the user message is extracted.
    user_input = None
    try:
        # Take the most recent user message from the conversation history.
        user_msg = next(
            (msg for msg in reversed(request.messages) if msg["role"] == "user"),
            None,
        )
        if not user_msg:
            raise ValueError("No user input message detected")
        user_input = user_msg["content"]

        response_text = generate_response([user_input])[0]

        # Token counts for the OpenAI-style usage block.
        prompt_tokens = len(tokenizer.encode(user_input, add_special_tokens=False))
        completion_tokens = len(tokenizer.encode(response_text, add_special_tokens=False))

        return {
            # hash() returns an int, so it cannot be sliced directly; format it
            # as hex to get a short, stable-looking request id.
            "id": f"chatcmpl-{hash(user_input) & 0xFFFFFFFF:08x}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": request.model,
            "choices": [
                {
                    "message": {"role": "assistant", "content": response_text},
                    "finish_reason": "stop",
                    "index": 0,
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }
    except Exception as e:
        handle_generation_error(e, user_input)


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=API_V1_PORT,
        workers=1,
        log_level="warning",
    )
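# --- Usage sketch (kept as a comment so it never executes with the server) ---
# A minimal client call, assuming the service is running locally and
# API_V1_PORT resolves to 8000 (the port value lives in config.py). Because
# the endpoint mirrors OpenAI's chat-completions schema, any OpenAI-style
# request body works:
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={"messages": [{"role": "user", "content": "Hello, who are you?"}]},
#   )
#   print(resp.json()["choices"][0]["message"]["content"])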