Spaces:

ServiceNow-AI
/

Apriel-Chat

Running

App Files Files Community

bradnow committed on May 7

Commit

9e185d2

verified ·

1 Parent(s): b4a733b

Show thoughts separately and remove thoughts when calling the inference API

Browse files

Files changed (1) hide show

app.py +55 -17

app.py CHANGED Viewed

@@ -2,10 +2,10 @@ import os
 import gradio as gr
 from openai import OpenAI
-title = None  # "ServiceNow-AI Chat"
 description = None
-modelConfig = {
     "MODEL_NAME": os.environ.get("MODEL_NAME"),
     "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
     "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
@@ -15,46 +15,84 @@ modelConfig = {
 # Initialize the OpenAI client with the vLLM API URL and token
 client = OpenAI(
-    api_key=modelConfig.get('AUTH_TOKEN'),
-    base_url=modelConfig.get('VLLM_API_URL')
 )
 def chat_fn(message, history):
-    # Format history as OpenAI expects
-    formatted = [{"role": "user", "content": user} if i % 2 == 0 else {"role": "assistant", "content": assistant}
-                 for i, (user, assistant) in enumerate(history)]
-    formatted.append({"role": "user", "content": message})
     # Create the streaming response
     stream = client.chat.completions.create(
-        model=modelConfig.get('MODEL_NAME'),
-        messages=formatted,
         temperature=0.8,
         stream=True
     )
     output = ""
     for chunk in stream:
         # Extract the new content from the delta field
         content = getattr(chunk.choices[0].delta, "content", "")
         output += content
-        # Yield the current accumulated output, removing "<|end|>" if present
-        if output.endswith("<|end|>"):
-            yield {"role": "assistant", "content": output[:-7]}
-        else:
-            yield {"role": "assistant", "content": output}
 # Add the model display name and Hugging Face URL to the description
 # description = f"### Model: [{MODE_DISPLAY_NAME}]({MODEL_HF_URL})"
-print(f"Running model {modelConfig.get('MODE_DISPLAY_NAME')} ({modelConfig.get('MODEL_NAME')})")
 gr.ChatInterface(
     chat_fn,
     title=title,
     description=description,
     theme=gr.themes.Default(primary_hue="green"),
-    type="messages"
 ).launch()

 import gradio as gr
 from openai import OpenAI
+title = None  # "ServiceNow-AI Chat" # modelConfig.get('MODE_DISPLAY_NAME')
 description = None
+model_config = {
     "MODEL_NAME": os.environ.get("MODEL_NAME"),
     "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
     "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
 # Initialize the OpenAI client with the vLLM API URL and token
 client = OpenAI(
+    api_key=model_config.get('AUTH_TOKEN'),
+    base_url=model_config.get('VLLM_API_URL')
 )
 def chat_fn(message, history):
+    """Stream the model's reply as a thought message plus a final answer.
+    Args:
+        message: Text of the newest user message.
+        history: Earlier chat messages supplied by the caller; assistant
+            entries whose metadata has a title (earlier thoughts) are
+            dropped before the API request is built.
+    Yields:
+        A list holding the thought gr.ChatMessage and, once the
+        "[BEGIN FINAL RESPONSE]" marker has streamed in, a second
+        gr.ChatMessage with the answer text.
+    """
+    begin_marker = "[BEGIN FINAL RESPONSE]"
+    end_marker = "[END FINAL RESPONSE]"
+    end_token = "<|end|>"
+    # Remove any assistant messages with metadata from history
+    print(f"Original History: {history}")
+    history = [item for item in history if
+               not (isinstance(item, dict) and
+                    item.get("role") == "assistant" and
+                    isinstance(item.get("metadata"), dict) and
+                    item.get("metadata", {}).get("title") is not None)]
+    print(f"Updated History: {history}")
+    messages = history + [{"role": "user", "content": message}]
+    print(f"Messages: {messages}")
     # Create the streaming response
     stream = client.chat.completions.create(
+        model=model_config.get('MODEL_NAME'),
+        messages=messages,
         temperature=0.8,
         stream=True
     )
+    # Show the placeholder immediately; without this yield it would be
+    # replaced by the first chunk before the user ever saw it.
+    yield [gr.ChatMessage(
+        role="assistant",
+        content="Thinking...",
+        metadata={"title": "🧠 Thought"}
+    )]
     output = ""
     for chunk in stream:
+        # A chunk may arrive without any choices; there is nothing to add.
+        if not chunk.choices:
+            continue
         # Extract the new content from the delta field
+        # (the attribute can exist but be None, so normalise it to "")
+        output += getattr(chunk.choices[0].delta, "content", None) or ""
+        # Everything before the first begin marker is the thought; the rest
+        # (if the marker has arrived) is the final response.
+        thought, marker, answer = output.partition(begin_marker)
+        if thought.endswith(end_token):
+            thought = thought[:-len(end_token)]
+        reply = [gr.ChatMessage(
+            role="assistant",
+            content=thought,
+            metadata={"title": "🧠 Thought"}
+        )]
+        if marker:
+            # Cut at the end marker wherever it is, so trailing "\n" or a
+            # partially streamed end token never leaks into the answer.
+            answer = answer.partition(end_marker)[0]
+            if answer.endswith(end_token):
+                answer = answer[:-len(end_token)]
+            reply.append(gr.ChatMessage(
+                role="assistant",
+                content=answer
+            ))
+        # only yield the most recent assistant messages
+        yield reply
 # Add the model display name and Hugging Face URL to the description
 # description = f"### Model: [{MODE_DISPLAY_NAME}]({MODEL_HF_URL})"
+print(f"Running model {model_config.get('MODE_DISPLAY_NAME')} ({model_config.get('MODEL_NAME')})")
 gr.ChatInterface(
     chat_fn,
     title=title,
     description=description,
     theme=gr.themes.Default(primary_hue="green"),
+    type="messages",
 ).launch()