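"""Gradio chat UI for a ServiceNow-AI model served behind a vLLM
OpenAI-compatible endpoint.

Configuration is read from environment variables (MODEL_NAME,
MODE_DISPLAY_NAME, MODEL_HF_URL, VLLM_API_URL, AUTH_TOKEN). The model's
streamed reasoning and final answer are rendered as separate chat messages.
"""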
import os

import gradio as gr
from openai import OpenAI

title = None  # e.g. "ServiceNow-AI Chat" or model_config.get("MODE_DISPLAY_NAME")
description = None

# Model and endpoint configuration, read from the Space's environment variables
model_config = {
    "MODEL_NAME": os.environ.get("MODEL_NAME"),
    "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
    "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
    "VLLM_API_URL": os.environ.get("VLLM_API_URL"),
    "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
}

# Initialize the OpenAI client with the vLLM API URL and token
client = OpenAI(
    api_key=model_config.get("AUTH_TOKEN"),
    base_url=model_config.get("VLLM_API_URL"),
)
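# Note: with vLLM, the base_url is expected to be the server's
# OpenAI-compatible route (typically ending in /v1).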


def chat_fn(message, history):
    # Drop previous "thought" messages (assistant messages carrying metadata,
    # i.e. the "🧠 Thought" panels) so they are not re-sent to the model
    print(f"Original History: {history}")
    history = [
        item for item in history
        if not (
            isinstance(item, dict)
            and item.get("role") == "assistant"
            and isinstance(item.get("metadata"), dict)
            and item.get("metadata", {}).get("title") is not None
        )
    ]
    print(f"Updated History: {history}")
    messages = history + [{"role": "user", "content": message}]
    print(f"Messages: {messages}")

    # Create the streaming response
    stream = client.chat.completions.create(
        model=model_config.get("MODEL_NAME"),
        messages=messages,
        temperature=0.8,
        stream=True,
    )

    # Placeholder "thought" bubble, updated as reasoning tokens stream in
    history.append(gr.ChatMessage(
        role="assistant",
        content="Thinking...",
        metadata={"title": "🧠 Thought"}
    ))
| output = "" | |
| completion_started = False | |
| for chunk in stream: | |
| # Extract the new content from the delta field | |
| content = getattr(chunk.choices[0].delta, "content", "") | |
| output += content | |
| parts = output.split("[BEGIN FINAL RESPONSE]") | |
| if len(parts) > 1: | |
| if parts[1].endswith("[END FINAL RESPONSE]"): | |
| parts[1] = parts[1].replace("[END FINAL RESPONSE]", "") | |
| if parts[1].endswith("[END FINAL RESPONSE]\n<|end|>"): | |
| parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<|end|>", "") | |
| history[-1 if not completion_started else -2] = gr.ChatMessage( | |
| role="assistant", | |
| content=parts[0], | |
| metadata={"title": "🧠 Thought"} | |
| ) | |
| if completion_started: | |
| history[-1] = gr.ChatMessage( | |
| role="assistant", | |
| content=parts[1] | |
| ) | |
| elif len(parts) > 1 and not completion_started: | |
| completion_started = True | |
| history.append(gr.ChatMessage( | |
| role="assistant", | |
| content=parts[1] | |
| )) | |
| # only yield the most recent assistant messages | |
| messages_to_yield = history[-1:] if not completion_started else history[-2:] | |
| yield messages_to_yield | |


# Optionally show the model display name and Hugging Face URL in the description:
# description = f"### Model: [{model_config.get('MODE_DISPLAY_NAME')}]({model_config.get('MODEL_HF_URL')})"
print(f"Running model {model_config.get('MODE_DISPLAY_NAME')} ({model_config.get('MODEL_NAME')})")

gr.ChatInterface(
    chat_fn,
    title=title,
    description=description,
    theme=gr.themes.Default(primary_hue="green"),
    type="messages",
).launch()
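
# A minimal sketch of how this might be run locally; the values below are
# illustrative placeholders, not the Space's real configuration:
#   export MODEL_NAME="my-org/my-model"                 # model name served by vLLM
#   export MODE_DISPLAY_NAME="My Model Chat"            # display name for the UI
#   export MODEL_HF_URL="https://huggingface.co/my-org/my-model"
#   export VLLM_API_URL="http://localhost:8000/v1"      # OpenAI-compatible endpoint
#   export AUTH_TOKEN="sk-..."                          # token expected by the endpoint
#   python app.py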