fix reasoning model's thought process display
app.py
CHANGED
@@ -6,11 +6,34 @@ import gc
 import shutil
 import re

-#
-
+# ----- Custom CSS for pretty formatting of internal reasoning -----
+CUSTOM_CSS = """
+<style>
+/* Styles for the internal reasoning bullet list */
+ul.think-list {
+    margin: 0.5em 0 1em 1.5em;
+    padding: 0;
+    list-style-type: disc;
+}
+ul.think-list li {
+    margin-bottom: 0.5em;
+}
+
+/* Container style for the "in progress" internal reasoning */
+.chat-assistant {
+    background-color: #f9f9f9;
+    padding: 1em;
+    border-radius: 5px;
+    margin-bottom: 1em;
+}
+</style>
+"""
+st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
+
+# ----- Set a threshold for required free storage (in bytes) -----
 REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB

-# Available models
+# ----- Available models -----
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {
         "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
@@ -49,7 +72,7 @@ MODELS = {
     },
 }

-# Sidebar
+# ----- Sidebar settings -----
 with st.sidebar:
     st.header("⚙️ Settings")
     selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
@@ -78,11 +101,11 @@ with st.sidebar:
     except Exception as e:
         st.error(f"Disk usage error: {e}")

-# Model info
+# ----- Model info -----
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])

-#
+# ----- Session state initialization -----
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
@@ -92,10 +115,10 @@ if "chat_history" not in st.session_state:
 if "pending_response" not in st.session_state:
     st.session_state.pending_response = False

-# Ensure model directory exists
+# ----- Ensure model directory exists -----
 os.makedirs("models", exist_ok=True)

-#
+# ----- Functions for model management -----
 def cleanup_old_models():
     for f in os.listdir("models"):
         if f.endswith(".gguf") and f != selected_model["filename"]:
@@ -110,7 +133,7 @@ def download_model():
         repo_id=selected_model["repo_id"],
         filename=selected_model["filename"],
         local_dir="./models",
-        local_dir_use_symlinks=False,
+        local_dir_use_symlinks=False,  # Deprecated parameter; harmless warning.
     )

 def try_load_model(path):
@@ -130,9 +153,8 @@ def try_load_model(path):
         return str(e)

 def validate_or_download_model():
-    # Download model if
+    # Download model if not present locally.
     if not os.path.exists(model_path):
-        # Check free space and cleanup old models only if free space is insufficient.
         free_space = shutil.disk_usage(".").free
         if free_space < REQUIRED_SPACE_BYTES:
             st.info("Insufficient storage detected. Cleaning up old models to free up space.")
@@ -146,7 +168,6 @@ def validate_or_download_model():
             os.remove(model_path)
         except Exception:
             pass
-        # Check storage again before re-downloading.
         free_space = shutil.disk_usage(".").free
         if free_space < REQUIRED_SPACE_BYTES:
             st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
@@ -159,7 +180,7 @@ def validate_or_download_model():
         return result
     return result

-# Load model if changed
+# ----- Load model if changed -----
 if st.session_state.model_name != selected_model_name:
     if st.session_state.llm is not None:
         del st.session_state.llm
@@ -169,43 +190,38 @@ if st.session_state.model_name != selected_model_name:

 llm = st.session_state.llm

-# Display title and caption
+# ----- Display title and caption -----
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")

-# Render
+# ----- Render full chat history -----
 for chat in st.session_state.chat_history:
     with st.chat_message(chat["role"]):
         st.markdown(chat["content"])
-        # For assistant messages, if there's internal reasoning, display it behind an expander
+        # For assistant messages, if there's completed internal reasoning, display it behind an expander.
         if chat.get("role") == "assistant" and chat.get("thinking"):
             with st.expander("🧠 Model's Internal Reasoning"):
                 for t in chat["thinking"]:
                     st.markdown(t.strip())

-# Chat input widget
+# ----- Chat input widget -----
 user_input = st.chat_input("Ask something...")

 if user_input:
-    # Block new input if a response is still pending
     if st.session_state.pending_response:
         st.warning("Please wait for the assistant to finish responding.")
     else:
-        # Append and render the user's message
         st.session_state.chat_history.append({"role": "user", "content": user_input})
         with st.chat_message("user"):
             st.markdown(user_input)

-        # Mark that we are waiting for a response
         st.session_state.pending_response = True

         MAX_TURNS = 8
-        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
         trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
         messages = [{"role": "system", "content": system_prompt}] + trimmed_history

-        #
-        # one for visible output and one for the think part.
+        # ----- Streaming the assistant response -----
         with st.chat_message("assistant"):
             visible_placeholder = st.empty()
             thinking_placeholder = st.empty()
@@ -219,34 +235,46 @@ if user_input:
                 repeat_penalty=repeat_penalty,
                 stream=True,
             )
-
+
             for chunk in stream:
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
-
+
+                    # Determine if there is an open (in-progress) <think> block
+                    open_think = re.search(r"<think>([^<]*)$", full_response, flags=re.DOTALL)
+                    in_progress = open_think.group(1).strip() if open_think else ""
+
+                    # Create the visible response by removing any complete <think>...</think> blocks,
+                    # and also removing any in-progress (unclosed) <think> content.
                     visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
                     visible_placeholder.markdown(visible_response)
-
-
-                    if
-
-
+
+                    # If there's an in-progress thinking part, display it in a pretty style
+                    if in_progress:
+                        # You can further format in_progress as you like; here we wrap it in a styled div.
+                        thinking_html = f"""
+                        <div class="chat-assistant">
+                            <strong>Internal Reasoning (in progress):</strong>
+                            <br>{in_progress}
+                        </div>
+                        """
+                        thinking_placeholder.markdown(thinking_html, unsafe_allow_html=True)
                     else:
                         thinking_placeholder.empty()
-
-
-
+
+            # After streaming completes:
+            # Extract all completed <think> blocks (the final internal reasoning that was closed)
+            final_thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            # The final visible response: remove any <think> blocks or any in-progress open block.
+            final_visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+            final_visible = re.sub(r"<think>.*$", "", final_visible, flags=re.DOTALL)
+
             st.session_state.chat_history.append({
                 "role": "assistant",
-                "content":
-                "thinking":
+                "content": final_visible,
+                "thinking": final_thinking
            })
-            # Display the final internal reasoning behind an expander if available
-            if thinking:
-                with st.expander("🧠 Model's Internal Reasoning"):
-                    for t in thinking:
-                        st.markdown(t.strip())

-        # Clear the pending flag once done
         st.session_state.pending_response = False
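
The core of this fix is the regex split between visible output, completed <think>...</think> reasoning, and a still-open <think> block. Below is a minimal, hypothetical sketch (the helper name split_think does not exist in app.py) that pulls those same regexes out of the Streamlit callback so the splitting behavior can be exercised and tested in isolation.

import re

def split_think(text):
    """Split raw model output into (visible_text, completed_thoughts, in_progress_thought).

    Mirrors the regexes added in app.py: completed <think>...</think> blocks are
    collected, an unclosed trailing <think> block counts as "in progress", and
    both are stripped from the visible text.
    """
    # All completed <think>...</think> blocks.
    completed = [t.strip() for t in re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)]
    # An unclosed <think> block at the end of the stream, if any.
    open_match = re.search(r"<think>([^<]*)$", text, flags=re.DOTALL)
    in_progress = open_match.group(1).strip() if open_match else ""
    # Visible text: drop completed blocks first, then any unclosed remainder.
    visible = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    visible = re.sub(r"<think>.*$", "", visible, flags=re.DOTALL)
    return visible.strip(), completed, in_progress

if __name__ == "__main__":
    # Mid-stream: the reasoning block is still open, so it stays out of the answer.
    print(split_think("Sure.<think>The user wants a summary"))
    # -> ('Sure.', [], 'The user wants a summary')

    # Once the block closes, it moves from "in progress" to the completed list.
    print(split_think("Sure.<think>The user wants a summary</think> Here it is."))
    # -> ('Sure. Here it is.', ['The user wants a summary'], '')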
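
One property of the in-progress pattern worth noting: <think>([^<]*)$ only matches a trailing, unclosed block whose streamed content contains no "<" yet, so the in-progress panel briefly goes blank while the closing </think> tag itself is being emitted, after which the block reappears in the completed-reasoning expander. The follow-up substitution on <think>.*$ ensures that partially streamed reasoning, including a half-emitted closing tag, never leaks into the visible answer.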