import logging
from typing import Generator, List, Optional

from openai import OpenAI

logging.basicConfig(level=logging.INFO)


def request_generation(
    api_key: str,
    api_base: str,
    message: str,
    system_prompt: str,
    model_name: str,
    chat_history: Optional[List[dict]] = None,
    temperature: float = 0.3,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    max_new_tokens: int = 1024,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Send a streaming chat request to an OpenAI-compatible backend using the
    official OpenAI client, buffering output to improve LaTeX rendering.
    """
    client = OpenAI(api_key=api_key, base_url=api_base)

    # Build the message list: system prompt first, then any prior turns,
    # then the current user message.
    messages = [{"role": "system", "content": system_prompt}]
    if chat_history:
        messages.extend(chat_history)
    messages.append({"role": "user", "content": message})

    request_args = {
        "model": model_name,
        "messages": messages,
        "temperature": temperature,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
        "max_tokens": max_new_tokens,
        "stream": True,
    }
    if tools:
        request_args["tools"] = tools
    if tool_choice:
        request_args["tool_choice"] = tool_choice

    logging.info(f"[Gateway] Request to {api_base} | Model: {model_name}")

    try:
        stream = client.chat.completions.create(**request_args)
        collected = ""  # full response accumulated across all chunks
        buffer = ""
        for chunk in stream:
            # Some backends emit keep-alive or usage chunks with an empty
            # choices list; skip them rather than indexing into nothing.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content or ""
            collected += delta
            buffer += delta
            # Flush on a newline or roughly every 150 characters so that
            # multi-line LaTeX blocks arrive intact instead of being split
            # mid-expression.
            if "\n" in buffer or len(buffer) > 150:
                yield buffer
                buffer = ""
        # Flush any remaining partial line.
        if buffer:
            yield buffer
    except Exception as e:
        logging.exception("[Gateway] Streaming failed")
        yield f"Error: {e}"