import os, re, logging, gradio as gr
from openai import OpenAI
from gateway import request_generation
from utils import LATEX_DELIMS

# OpenAI-compatible endpoint (e.g., a vLLM server) configured via environment.
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
MODEL = os.getenv("MODEL_NAME", "")
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

logging.basicConfig(level=logging.INFO)
def format_analysis_response(text):
    """Split raw model output of the form "analysis ... assistantfinal ..."
    into a visible reasoning block and the final answer."""
    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
    if m:
        reasoning = m.group(1).strip()
        response = text.split("assistantfinal", 1)[-1].strip()
        return (
            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
            f"**💬 Response:**\n\n{response}"
        )
    return text.strip()
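# Example (hypothetical raw output, for illustration only):
#   format_analysis_response("analysisUser asks for a sum.assistantfinal2 + 2 = 4")
#   -> "**🤔 Analysis:**\n\n*User asks for a sum.*\n\n---\n\n**💬 Response:**\n\n2 + 2 = 4"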
def generate(message, history,
             system_prompt, temperature,
             frequency_penalty=0.0, presence_penalty=0.0,
             max_new_tokens=MAX_NEW_TOKENS):
    # Defaults cover the parameters the UI below does not expose as inputs.
    if not message.strip():
        yield "Please enter a prompt."
        return
    # Normalize history: accept both messages-style dicts and legacy
    # (user, assistant) tuples.
    msgs = []
    for h in history:
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u:
                msgs.append({"role": "user", "content": u})
            if a:
                msgs.append({"role": "assistant", "content": a})
| logging.info(f"[User] {message}") | |
| logging.info(f"[System] {system_prompt} | Temp={temperature}") | |
| collected, buffer = "", "" | |
| yielded_once = False | |
    try:
        for delta in request_generation(
            api_key=openai_api_key, api_base=openai_api_base,
            message=message, system_prompt=system_prompt,
            model_name=MODEL, chat_history=msgs,
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            max_new_tokens=max_new_tokens,
        ):
            if not delta:
                continue
            collected += delta
            buffer += delta
            # Emit the first delta immediately for a fast time-to-first-token,
            # then batch updates by newline or ~150 characters to limit UI churn.
            if not yielded_once:
                yield delta
                buffer = ""
                yielded_once = True
                continue
            if "\n" in buffer or len(buffer) > 150:
                yield collected
                buffer = ""
        final = format_analysis_response(collected)
        # Balance a dangling "$" so the LaTeX renderer does not swallow
        # the rest of the message.
        if final.count("$") % 2:
            final += "$"
        yield final
    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"
chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        scale=2,
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    stop_btn=True,
    # Only the system prompt and temperature are user-tunable; the penalty and
    # token-budget parameters fall back to the defaults in generate().
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
    ],
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of the softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description="This Space is an alpha release demonstrating the gpt-oss-120b model "
                "running on AMD MI300X infrastructure. The Space is released under the "
                "Apache 2.0 License.",
)
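# utils.py is not shown here; LATEX_DELIMS is assumed to follow Gradio's
# latex_delimiters schema (a list of {"left", "right", "display"} dicts), e.g.:
#   LATEX_DELIMS = [
#       {"left": "$$", "right": "$$", "display": True},
#       {"left": "$", "right": "$", "display": False},
#   ]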
| if __name__ == "__main__": | |
| chatbot_ui.queue(max_size=QUEUE_SIZE, | |
| default_concurrency_limit=CONCURRENCY_LIMIT).launch() |
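# To run locally (hypothetical values; the script name "app.py" is an assumption,
# and API_ENDPOINT must point at an OpenAI-compatible server such as vLLM):
#   API_KEY=EMPTY API_ENDPOINT=http://localhost:8000/v1 \
#   MODEL_NAME=openai/gpt-oss-120b python app.py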