fix role disorder error in history
app.py CHANGED

@@ -216,14 +216,16 @@ if user_input:
     if st.session_state.pending_response:
         st.warning("Please wait for the assistant to finish responding.")
     else:
-        #
-        st.session_state.chat_history.append({"role": "user", "content": user_input})
+        # Display the raw user input immediately in the chat view.
         with st.chat_message("user"):
             st.markdown(user_input)
-
+
+        # Append the plain user message to chat history for display purposes.
+        # (We will later override the last user message in the API call with the augmented version.)
+        st.session_state.chat_history.append({"role": "user", "content": user_input})
         st.session_state.pending_response = True

-        #
+        # Retrieve extra context from web search if enabled
         if enable_search:
             retrieved_context = retrieve_context(user_input, max_results=2, max_chars_per_result=150)
         else:
@@ -231,20 +233,26 @@ if user_input:
         st.sidebar.markdown("### Retrieved Context" if enable_search else "Web Search Disabled")
         st.sidebar.text(retrieved_context or "No context found.")

-        # Build an augmented
-        if retrieved_context:
-
-            "
-            f"
+        # Build an augmented user query by merging the system prompt (and search context when available)
+        if enable_search and retrieved_context:
+            augmented_user_input = (
+                f"{system_prompt_base.strip()}\n\n"
+                f"Use the following recent web search context to help answer the query:\n\n"
+                f"{retrieved_context}\n\n"
+                f"User Query: {user_input}"
             )
         else:
-
-
-
-        # Limit conversation history to the last 2 turns
+            augmented_user_input = f"{system_prompt_base.strip()}\n\nUser Query: {user_input}"
+
+        # Limit conversation history to the last MAX_TURNS turns (user/assistant pairs)
         MAX_TURNS = 2
         trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
-
+
+        # Replace the last user message (which is plain) with the augmented version for model input.
+        if trimmed_history and trimmed_history[-1]["role"] == "user":
+            messages = trimmed_history[:-1] + [{"role": "user", "content": augmented_user_input}]
+        else:
+            messages = trimmed_history + [{"role": "user", "content": augmented_user_input}]

         # Generate response with the LLM in a streaming fashion
         with st.chat_message("assistant"):
@@ -259,7 +267,6 @@ if user_input:
                 repeat_penalty=repeat_penalty,
                 stream=True,
             )
-
            for chunk in stream:
                if "choices" in chunk:
                    delta = chunk["choices"][0]["delta"].get("content", "")
@@ -268,7 +275,8 @@ if user_input:
                    visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
                    visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
                    visible_placeholder.markdown(visible_response)
-
+
+        # Append the assistant's response to conversation history.
         st.session_state.chat_history.append({"role": "assistant", "content": full_response})
         st.session_state.pending_response = False
-        gc.collect() #
+        gc.collect()  # Free memory
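
Why the change fixes the "role disorder" error: many chat templates require strict user/assistant alternation in the message list. The old code appended the plain user message to history and then (judging by the truncated deleted line at old 247) sent the augmented prompt as a further "user" turn, so the model could receive two consecutive "user" messages. The sketch below is a minimal, standalone illustration of the corrected pattern; build_messages is a hypothetical helper name, while MAX_TURNS and the replace-last-user logic mirror the diff.

MAX_TURNS = 2  # keep the last two user/assistant pairs, as in app.py

def build_messages(chat_history, augmented_user_input):
    # Hypothetical helper illustrating the pattern from the diff.
    # Trim history to the last MAX_TURNS user/assistant pairs.
    trimmed = chat_history[-(MAX_TURNS * 2):]
    if trimmed and trimmed[-1]["role"] == "user":
        # Replace the plain last user message rather than appending the
        # augmented one: appending would yield two consecutive "user"
        # entries, which strict-alternation templates reject.
        return trimmed[:-1] + [{"role": "user", "content": augmented_user_input}]
    return trimmed + [{"role": "user", "content": augmented_user_input}]

history = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello!"},
    {"role": "user", "content": "what's new in AI?"},
]
messages = build_messages(history, "SYSTEM PROMPT\n\nUser Query: what's new in AI?")
assert [m["role"] for m in messages] == ["user", "assistant", "user"]

The plain message stays in st.session_state.chat_history for display; only the model-facing list swaps in the augmented version, so role alternation is preserved.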