bradnow committed
Commit 5580f46 · Parent(s): 5761cb7

Update to handle Apriel-1.5-15b format and multiple endpoints list

Files changed (2)
  1. app.py +33 -12
  2. utils.py +28 -14
app.py CHANGED
@@ -1,4 +1,4 @@
-import datetime
+import random
 from uuid import uuid4
 
 from openai import OpenAI
@@ -6,14 +6,15 @@ import gradio as gr
 
 from theme import apriel
 from utils import COMMUNITY_POSTFIX_URL, get_model_config, check_format, models_config, \
-    logged_event_handler, DEBUG_MODEL, log_debug, log_info, log_error
+    logged_event_handler, DEBUG_MODE, DEBUG_MODEL, log_debug, log_info, log_error
 from log_chat import log_chat
 
 MODEL_TEMPERATURE = 0.8
 BUTTON_WIDTH = 160
-DEFAULT_OPT_OUT_VALUE = False
+DEFAULT_OPT_OUT_VALUE = DEBUG_MODE
 
-DEFAULT_MODEL_NAME = "Apriel-Nemotron-15b-Thinker" if not DEBUG_MODEL else "Apriel-5b"
+# If DEBUG_MODEL is True, use an alternative model (without reasoning) for testing
+DEFAULT_MODEL_NAME = "Apriel-1.5-15B-thinker" if not DEBUG_MODEL else "Apriel-1.5-15B-thinker"  # "Apriel-5b"
 
 BUTTON_ENABLED = gr.update(interactive=True)
 BUTTON_DISABLED = gr.update(interactive=False)
@@ -31,6 +32,9 @@ chat_start_count = 0
 model_config = {}
 openai_client = None
 
+USE_RANDOM_ENDPOINT = False
+endpoint_rotation_count = 0
+
 
 def app_loaded(state, request: gr.Request):
     message_html = setup_model(DEFAULT_MODEL_NAME, intial=False)
@@ -45,21 +49,28 @@ def update_model_and_clear_chat(model_name):
     return desc, []
 
 
-def setup_model(model_name, intial=False):
-    global model_config, openai_client
-    model_config = get_model_config(model_name)
+def setup_model(model_key, intial=False):
+    global model_config, openai_client, endpoint_rotation_count
+    model_config = get_model_config(model_key)
     log_debug(f"update_model() --> Model config: {model_config}")
+
+    url_list = (model_config.get('VLLM_API_URL_LIST') or "").split(",")
+    if USE_RANDOM_ENDPOINT:
+        base_url = random.choice(url_list) if len(url_list) > 0 else model_config.get('VLLM_API_URL')
+    else:
+        base_url = url_list[endpoint_rotation_count % len(url_list)]
+        endpoint_rotation_count += 1
+
     openai_client = OpenAI(
         api_key=model_config.get('AUTH_TOKEN'),
-        base_url=model_config.get('VLLM_API_URL')
+        base_url=base_url
     )
+    log_debug(f"Switched to model {model_key} using endpoint {base_url}")
 
     _model_hf_name = model_config.get("MODEL_HF_URL").split('https://huggingface.co/')[1]
     _link = f"<a href='{model_config.get('MODEL_HF_URL')}{COMMUNITY_POSTFIX_URL}' target='_blank'>{_model_hf_name}</a>"
     _description = f"We'd love to hear your thoughts on the model. Click here to provide feedback - {_link}"
 
-    log_debug(f"Switched to model {_model_hf_name}")
-
     if intial:
         return
     else:
@@ -95,6 +106,9 @@ def run_chat_inference(history, message, state):
     error = None
     model_name = model_config.get('MODEL_NAME')
 
+    # Reinitialize the OpenAI client with a random endpoint from the list
+    setup_model(model_config.get('MODEL_KEY'))
+
     if len(history) == 0:
         state["chat_id"] = uuid4().hex
 
@@ -187,8 +201,9 @@ def run_chat_inference(history, message, state):
                 log_debug(f"chat_fn() --> Stopping streaming...")
                 break  # Exit the loop if the stop flag is set
             # Extract the new content from the delta field
-            content = getattr(chunk.choices[0].delta, "content", "")
-            output += content
+            content = getattr(chunk.choices[0].delta, "content", "") or ""
+            reasoning_content = getattr(chunk.choices[0].delta, "reasoning_content", "") or ""
+            output += reasoning_content + content
 
            if is_reasoning:
                 parts = output.split("[BEGIN FINAL RESPONSE]")
@@ -198,8 +213,12 @@
                     parts[1] = parts[1].replace("[END FINAL RESPONSE]", "")
                 if parts[1].endswith("[END FINAL RESPONSE]\n<|end|>"):
                     parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<|end|>", "")
+                if parts[1].endswith("[END FINAL RESPONSE]\n<|end|>\n"):
+                    parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<|end|>\n", "")
                 if parts[1].endswith("<|end|>"):
                     parts[1] = parts[1].replace("<|end|>", "")
+                if parts[1].endswith("<|end|>\n"):
+                    parts[1] = parts[1].replace("<|end|>\n", "")
 
                 history[-1 if not completion_started else -2] = gr.ChatMessage(
                     role="assistant",
@@ -220,6 +239,8 @@
         else:
             if output.endswith("<|end|>"):
                 output = output.replace("<|end|>", "")
+            if output.endswith("<|end|>\n"):
+                output = output.replace("<|end|>\n", "")
             history[-1] = gr.ChatMessage(
                 role="assistant",
                 content=output
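
The heart of the multiple-endpoints change is the selection block added to setup_model(). Pulled out into a standalone sketch (the endpoint URLs are placeholders, and _rotation stands in for the module-level endpoint_rotation_count), it behaves like this:

import random
from itertools import count

_rotation = count()  # stands in for the global endpoint_rotation_count

def pick_endpoint(model_config: dict, use_random: bool = False) -> str:
    # Mirrors the selection logic in setup_model(). Note that "".split(",")
    # returns [""], so url_list is never empty and the VLLM_API_URL fallback
    # in the random branch is effectively unreachable.
    url_list = (model_config.get('VLLM_API_URL_LIST') or "").split(",")
    if use_random:
        return random.choice(url_list) if len(url_list) > 0 else model_config.get('VLLM_API_URL')
    # Round-robin: successive calls walk the list in order.
    return url_list[next(_rotation) % len(url_list)]

cfg = {"VLLM_API_URL_LIST": "http://vllm-0:8000/v1,http://vllm-1:8000/v1"}
pick_endpoint(cfg)  # http://vllm-0:8000/v1
pick_endpoint(cfg)  # http://vllm-1:8000/v1
pick_endpoint(cfg)  # http://vllm-0:8000/v1 again

Since run_chat_inference() now calls setup_model() at the top of every request, the round-robin counter advances once per request, spreading traffic across the listed vLLM replicas.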
utils.py CHANGED
@@ -12,27 +12,41 @@ DEBUG_MODE = False or os.environ.get("DEBUG_MODE") == "True"
 DEBUG_MODEL = False or os.environ.get("DEBUG_MODEL") == "True"
 
 models_config = {
-    "Apriel-Nemotron-15b-Thinker": {
-        "MODEL_DISPLAY_NAME": "Apriel-Nemotron-15b-Thinker",
-        "MODEL_HF_URL": "https://huggingface.co/ServiceNow-AI/Apriel-Nemotron-15b-Thinker",
-        "MODEL_NAME": os.environ.get("MODEL_NAME_NEMO_15B"),
-        "VLLM_API_URL": os.environ.get("VLLM_API_URL_NEMO_15B"),
+    "Apriel-1.5-15B-thinker": {
+        "MODEL_DISPLAY_NAME": "Apriel-1.5-15B-thinker",
+        "MODEL_HF_URL": "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker",
+        "MODEL_NAME": os.environ.get("MODEL_NAME_APRIEL_1_5_15B"),
+        "VLLM_API_URL": os.environ.get("VLLM_API_URL_APRIEL_1_5_15B"),
+        "VLLM_API_URL_LIST": os.environ.get("VLLM_API_URL_LIST_APRIEL_1_5_15B"),
         "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
-        "REASONING": True
+        "REASONING": True,
+        "MULTIMODAL": True
     },
-    "Apriel-5b": {
-        "MODEL_DISPLAY_NAME": "Apriel-5b",
-        "MODEL_HF_URL": "https://huggingface.co/ServiceNow-AI/Apriel-5B-Instruct",
-        "MODEL_NAME": os.environ.get("MODEL_NAME_5B"),
-        "VLLM_API_URL": os.environ.get("VLLM_API_URL_5B"),
-        "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
-        "REASONING": False
-    }
+    # "Apriel-Nemotron-15b-Thinker": {
+    #     "MODEL_DISPLAY_NAME": "Apriel-Nemotron-15b-Thinker",
+    #     "MODEL_HF_URL": "https://huggingface.co/ServiceNow-AI/Apriel-Nemotron-15b-Thinker",
+    #     "MODEL_NAME": os.environ.get("MODEL_NAME_NEMO_15B"),
+    #     "VLLM_API_URL": os.environ.get("VLLM_API_URL_NEMO_15B"),
+    #     "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
+    #     "REASONING": True,
+    #     "MULTIMODAL": False
+    # },
+    # "Apriel-5b": {
+    #     "MODEL_DISPLAY_NAME": "Apriel-5b",
+    #     "MODEL_HF_URL": "https://huggingface.co/ServiceNow-AI/Apriel-5B-Instruct",
+    #     "MODEL_NAME": os.environ.get("MODEL_NAME_5B"),
+    #     "VLLM_API_URL": os.environ.get("VLLM_API_URL_5B"),
+    #     "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
+    #     "REASONING": False,
+    #     "MULTIMODAL": False
+    # }
 }
 
 
 def get_model_config(model_name: str) -> dict:
     config = models_config.get(model_name)
+    config['MODEL_KEY'] = model_name
+
     if not config:
         raise ValueError(f"Model {model_name} not found in models_config")
     if not config.get("MODEL_NAME"):
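
The new MODEL_KEY field is what ties the two files together: get_model_config() records the lookup key on the config dict, and run_chat_inference() feeds it back through setup_model(model_config.get('MODEL_KEY')) to re-resolve the entry on each request. A minimal sketch of that round trip (assuming the MODEL_NAME_APRIEL_1_5_15B environment variable is set before utils.py is imported):

from utils import get_model_config

config = get_model_config("Apriel-1.5-15B-thinker")
assert config["MODEL_KEY"] == "Apriel-1.5-15B-thinker"
# setup_model(config["MODEL_KEY"]) re-resolves this entry and rotates to the
# next endpoint in VLLM_API_URL_LIST, as shown in the app.py diff above.

One behavioral detail: because the key is assigned before the `if not config:` guard, an unknown model name now raises a TypeError on the None assignment rather than reaching the ValueError below it.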