Spaces:

amd
/

gpt-oss-120b-chatbot

Running on CPU Upgrade

App Files Files Community

gpt-oss-120b-chatbot / app.py

mahdicv

reverting to chat.completion API

a93f838 about 2 months ago

raw

history blame contribute delete

3.92 kB

	import os, re, logging, gradio as gr
	from openai import OpenAI
	from gateway import request_generation
	from utils import LATEX_DELIMS

	openai_api_key = os.getenv("API_KEY")
	openai_api_base = os.getenv("API_ENDPOINT")
	MODEL = os.getenv("MODEL_NAME", "")
	client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
	MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
	CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
	QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

	logging.basicConfig(level=logging.INFO)

	def format_analysis_response(text):
	m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
	if m:
	reasoning = m.group(1).strip()
	response = text.split("assistantfinal", 1)[-1].strip()
	return (
	f"🤔 Analysis:\n\n{reasoning}\n\n---\n\n"
	f"💬 Response:\n\n{response}"
	)
	return text.strip()

	def generate(message, history,
	system_prompt, temperature,
	frequency_penalty, presence_penalty,
	max_new_tokens):

	if not message.strip():
	yield "Please enter a prompt."
	return

	msgs = []
	for h in history:
	if isinstance(h, dict):
	msgs.append(h)
	elif isinstance(h, (list, tuple)) and len(h) == 2:
	u, a = h
	if u: msgs.append({"role": "user", "content": u})
	if a: msgs.append({"role": "assistant", "content": a})

	logging.info(f"[User] {message}")
	logging.info(f"[System] {system_prompt} \| Temp={temperature}")

	collected, buffer = "", ""
	yielded_once = False

	try:
	for delta in request_generation(
	api_key=openai_api_key, api_base=openai_api_base,
	message=message, system_prompt=system_prompt,
	model_name=MODEL, chat_history=msgs,
	temperature=temperature,
	frequency_penalty=frequency_penalty,
	presence_penalty=presence_penalty,
	max_new_tokens=max_new_tokens,
	):
	if not delta:
	continue

	collected += delta
	buffer += delta

	if not yielded_once:
	yield delta
	buffer = ""
	yielded_once = True
	continue

	if "\n" in buffer or len(buffer) > 150:
	yield collected
	buffer = ""

	final = format_analysis_response(collected)
	if final.count("$") % 2:
	final += "$"
	yield final

	except Exception as e:
	logging.exception("Stream failed")
	yield f"❌ Error: {e}"

	chatbot_ui = gr.ChatInterface(
	fn=generate,
	type="messages",
	chatbot=gr.Chatbot(
	label="OSS vLLM Chatbot",
	type="messages",
	scale=2,
	height=600,
	latex_delimiters=LATEX_DELIMS,
	),
	stop_btn=True,
	additional_inputs=[
	gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
	gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
	],
	examples=[
	["Explain the difference between supervised and unsupervised learning."],
	["Summarize the plot of Inception in two sentences."],
	["Show me the LaTeX for the quadratic formula."],
	["What are advantages of AMD Instinct MI300X GPU?"],
	["Derive the gradient of softmax cross-entropy loss."],
	["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
	],
	# title="Open-source GPT-OSS-120B on AMD MI300X",
	title=" GPT-OSS-120B on AMD MI300X",
	description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
	)
	if __name__ == "__main__":
	chatbot_ui.queue(max_size=QUEUE_SIZE,
	default_concurrency_limit=CONCURRENCY_LIMIT).launch()