Update chat_handler.py
chat_handler.py  (+446 -45)  CHANGED
@@ -1,6 +1,7 @@
 """
 Chat handling logic for Universal MCP Client - Fixed Version with File Upload Support
 """
+import asyncio
 import re
 import logging
 import traceback
@@ -188,6 +189,7 @@ class ChatHandler:
         recent_history = history[-max_history:] if len(history) > max_history else history

         last_role = None
+        is_gpt_oss = AppConfig.is_gpt_oss_model(self.mcp_client.current_model) if self.mcp_client.current_model else False
         for msg in recent_history:
             # Handle both ChatMessage objects and dictionary format for backward compatibility
             if hasattr(msg, 'role'):  # ChatMessage object
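AppConfig.is_gpt_oss_model is used above but is not part of this diff. A minimal sketch of what such a helper might look like, purely an assumption about the config module (the real check may differ):

# Hypothetical helper (assumed, not shown in this commit): treat any model id
# that mentions "gpt-oss" as a text-only GPT-OSS model.
def is_gpt_oss_model(model_id: str) -> bool:
    return "gpt-oss" in (model_id or "").lower()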
@@ -200,31 +202,52 @@ class ChatHandler:
                 continue  # Skip invalid messages

             if role == "user":
+                if is_gpt_oss:
+                    # Text-only content for GPT-OSS (no multimodal parts)
+                    if isinstance(content, dict) and "path" in content:
+                        file_path = content.get("path", "")
+                        # Omit media content; optionally note the upload as text
+                        text_piece = ""
+                        # Choose to ignore media fully to avoid confusing the model
+                    elif isinstance(content, (list, tuple)):
+                        text_piece = f"[List: {str(content)[:50]}...]"
+                    elif content is None:
+                        text_piece = "[Empty]"
                     else:
+                        text_piece = str(content)
+
+                    if messages and last_role == "user" and isinstance(messages[-1].get("content"), str):
+                        # Concatenate text
+                        if text_piece:
+                            messages[-1]["content"] = (messages[-1]["content"] + "\n" + text_piece) if messages[-1]["content"] else text_piece
+                    else:
+                        messages.append({"role": "user", "content": text_piece})
+                    last_role = "user"
                 else:
+                    # Build multimodal user messages with parts (for non-GPT-OSS)
+                    part = None
+                    if isinstance(content, dict) and "path" in content:
+                        file_path = content.get("path", "")
+                        if isinstance(file_path, str) and file_path.startswith("http") and AppConfig.is_image_file(file_path):
+                            part = {"type": "image_url", "image_url": {"url": file_path}}
+                        else:
+                            part = {"type": "text", "text": f"[File: {file_path}]"}
+                    elif isinstance(content, (list, tuple)):
+                        part = {"type": "text", "text": f"[List: {str(content)[:50]}...]"}
+                    elif content is None:
+                        part = {"type": "text", "text": "[Empty]"}
+                    else:
+                        part = {"type": "text", "text": str(content)}
+
+                    if messages and last_role == "user" and isinstance(messages[-1].get("content"), list):
+                        messages[-1]["content"].append(part)
+                    elif messages and last_role == "user" and isinstance(messages[-1].get("content"), str):
+                        # Convert existing string content to parts and append
+                        existing_text = messages[-1]["content"]
+                        messages[-1]["content"] = [{"type": "text", "text": existing_text}, part]
+                    else:
+                        messages.append({"role": "user", "content": [part]})
+                    last_role = "user"

             elif role == "assistant":
                 # Assistant content remains text for chat.completions API
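For reference, a minimal illustration of the two user-message shapes this branch builds (sample values are made up): GPT-OSS models get plain string content, other models get a chat.completions-style list of typed parts.

# Illustrative only; the text and URL are placeholders.
gpt_oss_user_message = {"role": "user", "content": "Describe the file I just uploaded"}

multimodal_user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe the file I just uploaded"},
        {"type": "image_url", "image_url": {"url": "https://example.com/uploaded.png"}},
    ],
}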
@@ -252,40 +275,54 @@ class ChatHandler:
         return self._call_hf_with_mcp(messages, uploaded_file_urls)

     def _call_hf_without_mcp(self, messages: List[Dict[str, Any]]) -> List[ChatMessage]:
+        """Call HF Inference API without MCP servers. Streams tokens for faster feedback."""
+        logger.info("💬 No MCP servers available, using streaming HF Inference chat when possible")
+
         system_prompt = self._get_native_system_prompt()
+
         # Add system prompt to messages
         if messages and messages[0].get("role") == "system":
             messages[0]["content"] = system_prompt + "\n\n" + messages[0]["content"]
         else:
             messages.insert(0, {"role": "system", "content": system_prompt})
+
         # Get optimal token settings
         if self.mcp_client.current_model and self.mcp_client.current_provider:
             context_settings = AppConfig.get_optimal_context_settings(
+                self.mcp_client.current_model,
                 self.mcp_client.current_provider,
                 0  # No MCP servers
             )
             max_tokens = context_settings['max_response_tokens']
         else:
             max_tokens = 8192
+
+        # Try streaming first; fall back to non-streaming on error
         try:
+            stream = self.mcp_client.generate_chat_completion_stream(messages, **{"max_tokens": max_tokens})
+            accumulated = ""
+            for chunk in stream:
+                try:
+                    delta = chunk.choices[0].delta.content or ""
+                except Exception:
+                    # Some SDK variants stream as message deltas differently
+                    delta = getattr(getattr(chunk.choices[0], "delta", None), "content", "") or ""
+                if delta:
+                    accumulated += delta
+            if not accumulated:
+                accumulated = "I understand your request and I'm here to help."
+            return [ChatMessage(role="assistant", content=accumulated)]
         except Exception as e:
+            logger.warning(f"Streaming failed, retrying without stream: {e}")
+            try:
+                response = self.mcp_client.generate_chat_completion(messages, **{"max_tokens": max_tokens})
+                response_text = response.choices[0].message.content
+                if not response_text:
+                    response_text = "I understand your request and I'm here to help."
+                return [ChatMessage(role="assistant", content=response_text)]
+            except Exception as e2:
+                logger.error(f"HF Inference API call failed: {e2}")
+                return [ChatMessage(role="assistant", content=f"❌ API call failed: {str(e2)}")]

     def _call_hf_with_mcp(self, messages: List[Dict[str, Any]], uploaded_file_urls: List[str] = None) -> List[ChatMessage]:
         """Call HF Inference API with MCP servers and return structured responses"""
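generate_chat_completion_stream and generate_chat_completion are methods of the MCP client and are not shown in this diff. A minimal sketch of what they might wrap, assuming self.hf_client is an OpenAI-compatible chat-completions client (for example huggingface_hub.InferenceClient); the names and parameters here are illustrative, not the project's actual implementation:

from typing import Any, Dict, Iterable, List

class _ClientSketch:
    """Hypothetical slice of the MCP client that the handler above relies on."""

    def __init__(self, hf_client: Any, current_model: str):
        self.hf_client = hf_client      # e.g. huggingface_hub.InferenceClient(provider=..., api_key=...)
        self.current_model = current_model

    def generate_chat_completion_stream(self, messages: List[Dict[str, Any]], **kwargs) -> Iterable[Any]:
        # stream=True returns an iterator of chunks; callers read chunk.choices[0].delta.content
        return self.hf_client.chat.completions.create(
            model=self.current_model, messages=messages, stream=True, **kwargs
        )

    def generate_chat_completion(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
        # Non-streaming fallback: one response object with choices[0].message.content
        return self.hf_client.chat.completions.create(
            model=self.current_model, messages=messages, stream=False, **kwargs
        )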
@@ -433,13 +470,13 @@ class ChatHandler:
                 # Tool execution failed
                 error_details = tool_info['result']

+                # Create main tool message with pending status (error reflected in content)
                 chat_messages.append(ChatMessage(
                     role="assistant",
                     content="",
                     metadata={
                         "title": f"❌ Used {tool_info['tool']}",
+                        "status": "pending",
                         "duration": duration,
                         "id": tool_id
                     }
@@ -452,7 +489,7 @@ class ChatHandler:
                     metadata={
                         "title": "📡 Server Response",
                         "parent_id": tool_id,
+                        "status": "done"
                     }
                 ))

@@ -463,7 +500,7 @@ class ChatHandler:
                     metadata={
                         "title": "💡 Possible Solutions",
                         "parent_id": tool_id,
+                        "status": "done"
                     }
                 ))
             else:
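These metadata keys follow Gradio's ChatMessage conventions: title renders a collapsible block, id and parent_id nest one block under another, duration shows execution time, and status toggles the pending spinner. A small standalone illustration (the tool name, id and texts are invented):

import gradio as gr

tool_block = gr.ChatMessage(
    role="assistant",
    content="",
    metadata={"title": "❌ Used letter_counter", "status": "pending", "id": "tool_1", "duration": 0.4},
)
server_response = gr.ChatMessage(
    role="assistant",
    content="Connection refused by upstream server",
    metadata={"title": "📡 Server Response", "parent_id": "tool_1", "status": "done"},
)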
@@ -482,6 +519,370 @@ class ChatHandler:
                 ))

         return chat_messages
+
+    def process_multimodal_message_stream(self, message: Dict[str, Any], history: List):
+        """Generator that streams assistant output to the UI as it arrives.
+        - Streams for plain LLM chats
+        - Streams initial planning/tool JSON for MCP flows, executes tool, then streams final answer
+        - Attempts to surface reasoning/thinking traces when available
+        """
+        try:
+            # Pre-checks
+            if not self.mcp_client.hf_client:
+                error_msg = "❌ HuggingFace token not configured. Please set HF_TOKEN environment variable or login."
+                history.append(ChatMessage(role="assistant", content=error_msg))
+                yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                return
+
+            if not self.mcp_client.current_provider or not self.mcp_client.current_model:
+                error_msg = "❌ Please select an inference provider and model first."
+                history.append(ChatMessage(role="assistant", content=error_msg))
+                yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                return
+
+            # Parse user input (message is either a MultimodalTextbox dict or a plain string)
+            if isinstance(message, str):
+                user_text = message
+                user_files = []
+            else:
+                user_text = message.get("text", "") if message else ""
+                user_files = message.get("files", []) if message else []
+
+            # Upload files and update history similarly to non-stream path
+            self.file_url_mapping = {}
+            uploaded_file_urls: List[str] = []
+
+            if user_files:
+                for file_path in user_files:
+                    try:
+                        uploaded_url = self._upload_file_to_gradio_server(file_path)
+                        self.file_url_mapping[file_path] = uploaded_url
+                        uploaded_file_urls.append(uploaded_url)
+                        history.append(ChatMessage(role="user", content={"path": uploaded_url}))
+                    except Exception:
+                        history.append(ChatMessage(role="user", content={"path": file_path}))
+
+            if user_text and user_text.strip():
+                history.append(ChatMessage(role="user", content=user_text))
+
+            if not user_text.strip() and not user_files:
+                yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                return
+
+            # Prepare messages for HF
+            messages = self._prepare_hf_messages(history, uploaded_file_urls)
+            # Choose streaming path based on MCP servers
+            if self.mcp_client.get_enabled_servers():
+                # Stream with MCP planning/tool execution
+                yield from self._stream_with_mcp(messages, uploaded_file_urls, history)
+            else:
+                # Plain LLM streaming with optional thinking trace
+                yield from self._stream_without_mcp(messages, history)
+        except Exception as e:
+            history.append(ChatMessage(role="assistant", content=f"❌ Error: {str(e)}"))
+            yield history, gr.MultimodalTextbox(value=None, interactive=True)
+
+    def _stream_without_mcp(self, messages: List[Dict[str, Any]], history: List):
+        """Stream tokens for plain LLM chats; attempts to surface reasoning traces if available."""
+        # Add system prompt
+        system_prompt = self._get_native_system_prompt()
+        if messages and messages[0].get("role") == "system":
+            messages[0]["content"] = system_prompt + "\n\n" + messages[0]["content"]
+        else:
+            messages.insert(0, {"role": "system", "content": system_prompt})
+
+        # Compute max tokens
+        if self.mcp_client.current_model and self.mcp_client.current_provider:
+            ctx = AppConfig.get_optimal_context_settings(
+                self.mcp_client.current_model, self.mcp_client.current_provider, 0
+            )
+            max_tokens = ctx["max_response_tokens"]
+        else:
+            max_tokens = 8192
+
+        # Insert placeholders: optional thinking + main assistant
+        thinking_index = None
+        # Prepare a thinking message only when we actually receive thinking tokens
+        history.append(ChatMessage(role="assistant", content=""))
+        main_index = len(history) - 1
+        yield history, gr.MultimodalTextbox(value=None, interactive=False)
+
+        accumulated = ""
+        thinking_accum = ""
+        try:
+            stream = self.mcp_client.generate_chat_completion_stream(messages, **{"max_tokens": max_tokens})
+            for chunk in stream:
+                delta = getattr(chunk.choices[0], "delta", None)
+                # Reasoning/thinking traces (best-effort extraction)
+                reason_delta = None
+                if delta is not None:
+                    # Some providers expose .reasoning or .thinking
+                    reason_delta = (
+                        getattr(delta, "reasoning", None)
+                        or getattr(delta, "thinking", None)
+                    )
+                if reason_delta:
+                    thinking_accum += str(reason_delta)
+                    if thinking_index is None:
+                        history.insert(main_index, ChatMessage(
+                            role="assistant",
+                            content=f"{thinking_accum}",
+                            metadata={"title": "🧠 Reasoning", "status": "pending"}
+                        ))
+                        thinking_index = main_index
+                        main_index += 1
+                    else:
+                        history[thinking_index] = ChatMessage(
+                            role="assistant",
+                            content=f"{thinking_accum}",
+                            metadata={"title": "🧠 Reasoning", "status": "pending"}
+                        )
+
+                # Main content
+                delta_text = ""
+                try:
+                    delta_text = delta.content or ""
+                except Exception:
+                    delta_text = getattr(delta, "content", "") or ""
+                if not delta_text:
+                    yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                    continue
+                accumulated += delta_text
+                history[main_index] = ChatMessage(role="assistant", content=accumulated)
+                yield history, gr.MultimodalTextbox(value=None, interactive=False)
+        except Exception as e:
+            # Fallback to non-stream
+            try:
+                resp = self.mcp_client.generate_chat_completion(messages, **{"max_tokens": max_tokens})
+                final_text = resp.choices[0].message.content or "I understand your request and I'm here to help."
+                history[main_index] = ChatMessage(role="assistant", content=final_text)
+                yield history, gr.MultimodalTextbox(value=None, interactive=True)
+                return
+            except Exception as e2:
+                history[main_index] = ChatMessage(role="assistant", content=f"❌ API call failed: {str(e2)}")
+                yield history, gr.MultimodalTextbox(value=None, interactive=True)
+                return
+
+        # Final yield
+        yield history, gr.MultimodalTextbox(value=None, interactive=True)
+
+    def _stream_with_mcp(self, messages: List[Dict[str, Any]], uploaded_file_urls: List[str], history: List):
+        """Stream initial planning/tool JSON, execute MCP tool, then stream final response."""
+        # Enhanced system prompt with MCP guidance
+        system_prompt = self._get_mcp_system_prompt(uploaded_file_urls)
+        if messages and messages[0].get("role") == "system":
+            messages[0]["content"] = system_prompt + "\n\n" + messages[0]["content"]
+        else:
+            messages.insert(0, {"role": "system", "content": system_prompt})
+
+        # Compute max tokens taking enabled servers into account
+        enabled_servers = self.mcp_client.get_enabled_servers()
+        if self.mcp_client.current_model and self.mcp_client.current_provider:
+            ctx = AppConfig.get_optimal_context_settings(
+                self.mcp_client.current_model, self.mcp_client.current_provider, len(enabled_servers)
+            )
+            max_tokens = ctx["max_response_tokens"]
+        else:
+            max_tokens = 8192
+
+        # Placeholders: planning/tool JSON + main assistant
+        planning_index = None
+        thinking_index = None
+        history.append(ChatMessage(role="assistant", content=""))
+        main_index = len(history) - 1
+        yield history, gr.MultimodalTextbox(value=None, interactive=False)
+
+        text_accum = ""
+        tool_json_accum = ""
+        in_tool_json = False
+        tool_json_detected = False
+        try:
+            stream = self.mcp_client.generate_chat_completion_stream(messages, **{"max_tokens": max_tokens})
+            for chunk in stream:
+                delta = getattr(chunk.choices[0], "delta", None)
+                # Optional reasoning
+                reason_delta = None
+                if delta is not None:
+                    reason_delta = (
+                        getattr(delta, "reasoning", None)
+                        or getattr(delta, "thinking", None)
+                    )
+                if reason_delta:
+                    if thinking_index is None:
+                        history.insert(main_index, ChatMessage(
+                            role="assistant",
+                            content=str(reason_delta),
+                            metadata={"title": "🧠 Reasoning", "status": "pending"}
+                        ))
+                        thinking_index = main_index
+                        main_index += 1
+                    else:
+                        history[thinking_index] = ChatMessage(
+                            role="assistant",
+                            content=(history[thinking_index].content + str(reason_delta)),
+                            metadata={"title": "🧠 Reasoning", "status": "pending"}
+                        )
+
+                # Main content streaming and tool JSON detection (content-based JSON protocol)
+                piece = ""
+                try:
+                    piece = delta.content or ""
+                except Exception:
+                    piece = getattr(delta, "content", "") or ""
+                if not piece:
+                    yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                    continue
+
+                # Detect start of tool JSON
+                if not tool_json_detected and '{"use_tool":' in piece:
+                    in_tool_json = True
+                    tool_json_detected = True
+                if in_tool_json:
+                    tool_json_accum += piece
+                    # Initialize planning message
+                    if planning_index is None:
+                        history.insert(main_index, ChatMessage(
+                            role="assistant",
+                            content=tool_json_accum,
+                            metadata={"title": "🔧 Tool call (planning)", "status": "pending"}
+                        ))
+                        planning_index = main_index
+                        main_index += 1
+                    else:
+                        history[planning_index] = ChatMessage(
+                            role="assistant",
+                            content=tool_json_accum,
+                            metadata={"title": "🔧 Tool call (planning)", "status": "pending"}
+                        )
+
+                    # Try to reconstruct JSON when braces close
+                    reconstructed = self.mcp_client._reconstruct_json_from_start(tool_json_accum)
+                    if reconstructed:
+                        # We have a complete JSON
+                        in_tool_json = False
+                        # Clean planning content to the reconstructed JSON (for clarity)
+                        history[planning_index] = ChatMessage(
+                            role="assistant",
+                            content=reconstructed,
+                            metadata={"title": "🔧 Tool call", "status": "done"}
+                        )
+                        yield history, gr.MultimodalTextbox(value=None, interactive=False)
+
+                        # Execute tool now
+                        import json as _json
+                        try:
+                            tool_req = _json.loads(reconstructed)
+                        except Exception:
+                            tool_req = None
+                        if tool_req and tool_req.get("use_tool"):
+                            server_name = tool_req.get("server")
+                            tool_name = tool_req.get("tool")
+                            arguments = tool_req.get("arguments", {})
+
+                            # Status message
+                            exec_msg = ChatMessage(
+                                role="assistant",
+                                content=f"Executing {tool_name} on {server_name}…",
+                                metadata={"title": "🔧 Tool execution", "status": "pending"}
+                            )
+                            history.insert(main_index, exec_msg)
+                            exec_index = main_index
+                            main_index += 1
+                            yield history, gr.MultimodalTextbox(value=None, interactive=False)
+
+                            # Replace any local paths with uploaded URLs
+                            if hasattr(self, 'file_url_mapping'):
+                                for k, v in list(arguments.items()):
+                                    if isinstance(v, str) and v.startswith('/tmp/gradio/'):
+                                        for lpath, url in self.file_url_mapping.items():
+                                            if lpath in v or v in lpath:
+                                                arguments[k] = url
+                                                break
+
+                            # Run tool (blocking)
+                            def _run_tool():
+                                loop = asyncio.new_event_loop()
+                                asyncio.set_event_loop(loop)
+                                try:
+                                    return loop.run_until_complete(
+                                        self.mcp_client.call_mcp_tool_async(server_name, tool_name, arguments)
+                                    )
+                                finally:
+                                    loop.close()
+
+                            success, result = _run_tool()
+                            # Update exec message
+                            if success:
+                                content = str(result)
+                                history[exec_index] = ChatMessage(
+                                    role="assistant",
+                                    content=content if len(content) < 800 else content[:800] + "…",
+                                    metadata={"title": "📡 Server Response", "status": "done"}
+                                )
+                            else:
+                                history[exec_index] = ChatMessage(
+                                    role="assistant",
+                                    content=f"❌ Tool failed: {result}",
+                                    metadata={"title": "📡 Server Response", "status": "done"}
+                                )
+                            yield history, gr.MultimodalTextbox(value=None, interactive=False)
+
+                            # Start final streamed response using tool result
+                            final_messages = messages.copy()
+                            # Remove tools instruction portion from system if present
+                            if final_messages and final_messages[0].get("role") == "system":
+                                sys_text = final_messages[0]["content"]
+                                cut = sys_text.split("You have access to the following MCP tools:")[0].strip()
+                                final_messages[0]["content"] = cut
+                            # Add prior assistant (planning) and user tool result follow-up
+                            final_messages.append({"role": "assistant", "content": text_accum})
+                            final_messages.append({
+                                "role": "user",
+                                "content": f"Tool '{tool_name}' from server '{server_name}' completed. Result: {result}. Please provide a helpful response."
+                            })
+
+                            # Stream final answer into main message
+                            final_accum = ""
+                            try:
+                                final_stream = self.mcp_client.generate_chat_completion_stream(final_messages, **{"max_tokens": max_tokens})
+                                for fchunk in final_stream:
+                                    fdelta = getattr(fchunk.choices[0], "delta", None)
+                                    ftext = getattr(fdelta, "content", "") if fdelta is not None else ""
+                                    if not ftext:
+                                        yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                                        continue
+                                    final_accum += ftext
+                                    history[main_index] = ChatMessage(role="assistant", content=(text_accum + final_accum))
+                                    yield history, gr.MultimodalTextbox(value=None, interactive=False)
+                            except Exception:
+                                # Fallback non-stream finalization
+                                try:
+                                    fresp = self.mcp_client.generate_chat_completion(final_messages, **{"max_tokens": max_tokens})
+                                    ftxt = fresp.choices[0].message.content or ""
+                                    history[main_index] = ChatMessage(role="assistant", content=(text_accum + ftxt))
+                                    yield history, gr.MultimodalTextbox(value=None, interactive=True)
+                                    return
+                                except Exception as e3:
+                                    history[main_index] = ChatMessage(role="assistant", content=(text_accum + f"\n❌ Finalization failed: {e3}"))
+                                    yield history, gr.MultimodalTextbox(value=None, interactive=True)
+                                    return
+
+                            # Done
+                            yield history, gr.MultimodalTextbox(value=None, interactive=True)
+                            return
+                else:
+                    # Normal assistant visible text outside of tool JSON
+                    text_accum += piece
+                    history[main_index] = ChatMessage(role="assistant", content=text_accum)
+                    yield history, gr.MultimodalTextbox(value=None, interactive=False)
+        except Exception as e:
+            # Fallback: Use non-streaming MCP path
+            responses = self._call_hf_with_mcp(messages, uploaded_file_urls)
+            history.extend(responses)
+            yield history, gr.MultimodalTextbox(value=None, interactive=True)
+            return
+
+        # If we streamed without any tool usage, finalize
+        yield history, gr.MultimodalTextbox(value=None, interactive=True)

     def _extract_media_url(self, result_text: str, server_name: str) -> Optional[str]:
         """Extract media URL from MCP response with improved pattern matching"""
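_reconstruct_json_from_start on the client is what decides when the streamed tool-call JSON is complete; it is not part of this diff. A plausible brace-balancing sketch, assuming that is roughly what it does (string and escape handling deliberately simplified):

from typing import Optional

def _reconstruct_json_from_start(text: str) -> Optional[str]:
    # Scan from the first '{' and return the substring once braces balance,
    # or None while the JSON object is still incomplete.
    start = text.find("{")
    if start == -1:
        return None
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text[start:], start):
        if escaped:
            escaped = False
            continue
        if ch == "\\":
            escaped = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None  # braces not yet balanced; keep accumulating chunks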
@@ -644,4 +1045,4 @@ IMPORTANT NOTES:
 - ALWAYS provide a descriptive message before the JSON tool call
 - After tool execution, you can provide additional context or ask if the user needs anything else
 Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+Current model: {self.mcp_client.current_model} via {self.mcp_client.current_provider}"""