fix: prevent self-talking issue by using tokenizer chat_template formatting
- Updated prompt construction to use `tokenizer.apply_chat_template()` for chat-tuned models (sketched below)
- Ensured backward compatibility with non-chat models by falling back to manual prompt formatting
- Resolves issue where models would echo both user and assistant messages due to incorrect flat-text prompts
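For reference, the mechanism behind the fix can be sketched as follows; the model name and messages are illustrative placeholders, not the Space's actual configuration. A chat-template prompt wraps each turn in the model's own role delimiters and ends with an open assistant turn, so generation starts inside the assistant slot instead of continuing a flat `User:`/`Assistant:` transcript (which is what let the model write both sides of the conversation).

```python
# Minimal sketch of what apply_chat_template produces, using a placeholder
# chat-tuned model; the exact template text depends on the model.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder model
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Roughly: "<|system|>\nYou are a concise assistant.</s>\n<|user|>\nHello!</s>\n<|assistant|>\n"
```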
app.py CHANGED
```diff
@@ -95,22 +95,21 @@ def retrieve_context(query, max_results=6, max_chars=600):
     except Exception:
         return []
 
-    [removed lines 98-112: previous manual flat-text prompt construction; body not rendered in this extract]
-    return prompt
+def format_conversation(history, system_prompt, tokenizer):
+    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+        messages = [{"role": "system", "content": system_prompt.strip()}] + history
+        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    else:
+        # Fallback for base LMs without chat template
+        prompt = system_prompt.strip() + "\n"
+        for msg in history:
+            if msg['role'] == 'user':
+                prompt += "User: " + msg['content'].strip() + "\n"
+            elif msg['role'] == 'assistant':
+                prompt += "Assistant: " + msg['content'].strip() + "\n"
+        if not prompt.strip().endswith("Assistant:"):
+            prompt += "Assistant: "
+        return prompt
 
 @spaces.GPU(duration=60)
 def chat_response(user_msg, chat_history, system_prompt,
@@ -166,9 +165,8 @@ def chat_response(user_msg, chat_history, system_prompt,
     else:
         enriched = system_prompt
 
-    prompt = format_conversation(history, enriched)
-
     pipe = load_pipeline(model_name)
+    prompt = format_conversation(history, enriched, pipe.tokenizer)
     streamer = TextIteratorStreamer(pipe.tokenizer,
                                     skip_prompt=True,
                                     skip_special_tokens=True)
```
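A quick way to exercise both branches of the new helper locally, assuming `format_conversation` as committed above is in scope; the tokenizers named below are stand-ins, not models the Space necessarily serves. Note also the call-order change in the second hunk: the pipeline is now loaded before the prompt is built, because the tokenizer is needed for formatting.

```python
from transformers import AutoTokenizer

# Assumes format_conversation from app.py (as committed above) is in scope.
history = [
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "Paris."},
    {"role": "user", "content": "And of Italy?"},
]

# Chat-tuned tokenizer: takes the apply_chat_template branch (placeholder model).
chat_tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
print(format_conversation(history, "Answer concisely.", chat_tok))

# Tokenizer without a chat template (e.g. gpt2): takes the manual User:/Assistant: fallback.
base_tok = AutoTokenizer.from_pretrained("gpt2")
print(format_conversation(history, "Answer concisely.", base_tok))
```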