Spaces:

chinmayjha
/

context-ai

Sleeping

chinmayjha committed on Sep 10

Commit

0711be9

1 Parent(s): 8099e26

Improve source parsing and answer formatting

- Fix source deduplication to use document ID instead of title+date
- Extract all 5 sources instead of just 1 when multiple documents have same title/date
- Add rich source information including key findings, marketing insights, and quotes
- Improve answer formatting to properly parse JSON and convert \n to line breaks
- Fix import error in tools/app.py
- Enhanced UI display with better source cards and formatting

Files changed (4) hide show

configs/compute_rag_vector_index_openai_contextual_simple.yaml +1 -1
src/second_brain_online/application/ui/custom_gradio_ui.py +178 -77
src/second_brain_online/config.py +3 -3
tools/app.py +56 -2

configs/compute_rag_vector_index_openai_contextual_simple.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 parameters:
   extract_collection_name: raw
   fetch_limit: 200
-  load_collection_name: rag
   content_quality_score_threshold: 0.6
   retriever_type: contextual
   embedding_model_id: text-embedding-3-small

 parameters:
   extract_collection_name: raw
   fetch_limit: 200
+  load_collection_name: rag_insights_test
   content_quality_score_threshold: 0.6
   retriever_type: contextual
   embedding_model_id: text-embedding-3-small

src/second_brain_online/application/ui/custom_gradio_ui.py CHANGED Viewed

@@ -57,15 +57,13 @@ class CustomGradioUI:
             gr.Markdown("# 🧠 Second Brain AI Assistant")
             gr.Markdown("Ask questions about your documents and get AI-powered insights with source attribution.")
-            with gr.Row():
-                with gr.Column(scale=4):
-                    self.query_input = gr.Textbox(
-                        label="Ask a question",
-                        placeholder="What pricing objections were raised in the meetings?",
-                        lines=2
-                    )
-                with gr.Column(scale=1):
-                    self.submit_btn = gr.Button("Ask", variant="primary", size="lg")
             with gr.Row():
                 with gr.Column():
@@ -103,14 +101,25 @@ class CustomGradioUI:
             # Run the agent
             result = self.agent.run(query)
-            # Parse the result
-            answer, sources, tools_used = self.parse_agent_response(result)
             # Debug information
-            print(f"DEBUG - Raw result: {str(result)[:200]}...")
-            print(f"DEBUG - Parsed answer: {answer[:100]}...")
-            print(f"DEBUG - Sources found: {len(sources)}")
-            print(f"DEBUG - Tools found: {tools_used}")
             # Format outputs
             answer_html = self.format_answer(answer)
@@ -124,7 +133,7 @@ class CustomGradioUI:
             error_msg = f"<div style='color: #dc3545; padding: 12px; border: 1px solid #f5c6cb; border-radius: 4px; background-color: #f8d7da;'>Error: {str(e)}</div>"
             return error_msg, "", "", str(e)
-    def parse_agent_response(self, result: Any) -> Tuple[str, List[Dict], List[str]]:
         """Parse the agent response to extract answer, sources, and tools used."""
         answer = ""
         sources = []
@@ -133,20 +142,7 @@ class CustomGradioUI:
         # Convert result to string if it's not already
         result_str = str(result)
-        # Extract tool usage from the result first
-        # Pattern 1: 🛠️ Used tool toolname
-        tool_pattern1 = r'🛠️ Used tool (\w+)'
-        tool_matches1 = re.findall(tool_pattern1, result_str)
-        # Pattern 2: Calling tool: 'toolname'
-        tool_pattern2 = r"Calling tool:\s*'([^']+)'"
-        tool_matches2 = re.findall(tool_pattern2, result_str)
-        # Combine both patterns
-        all_tool_matches = tool_matches1 + tool_matches2
-        tools_used = list(set(all_tool_matches))  # Remove duplicates
-        # Try multiple patterns to extract the answer
         # Pattern 1: JSON format with "answer" key
         json_match = re.search(r'{"answer":\s*"([^"]+)"}', result_str)
         if json_match:
@@ -166,53 +162,106 @@ class CustomGradioUI:
                 # Pattern 3: Use the entire result as answer if no specific pattern matches
                 answer = result_str
-        # Extract sources from the answer text using multiple patterns
-        # Pattern 1: (Document: "Title", Date)
-        source_pattern1 = r'\(Document:\s*"([^"]+)",\s*([^)]+)\)'
-        source_matches1 = re.findall(source_pattern1, answer)
-        # Pattern 2: (Document: Title, Date) - without quotes
-        source_pattern2 = r'\(Document:\s*([^,]+),\s*([^)]+)\)'
-        source_matches2 = re.findall(source_pattern2, answer)
-        # Pattern 3: (Document 1, Date) - numbered format
-        source_pattern3 = r'\(Document\s+(\d+),\s*([^)]+)\)'
-        source_matches3 = re.findall(source_pattern3, answer)
-        # Pattern 4: (from "Title" on Date) - new format seen in output
-        source_pattern4 = r'\(from\s+"([^"]+)"\s+on\s+([^)]+)\)'
-        source_matches4 = re.findall(source_pattern4, answer)
-        # Pattern 5: (from "Title" on Date) - without quotes
-        source_pattern5 = r'\(from\s+([^"]+)\s+on\s+([^)]+)\)'
-        source_matches5 = re.findall(source_pattern5, answer)
-        # Combine all patterns
-        all_source_matches = source_matches1 + source_matches2 + source_matches3 + source_matches4 + source_matches5
-        for doc_title, doc_date in all_source_matches:
-            # Clean up the title and date
-            clean_title = doc_title.strip().strip('"')
-            clean_date = doc_date.strip()
-            # Handle numbered documents (Document 1, Document 2, etc.)
-            if clean_title.isdigit():
-                clean_title = f"Document {clean_title}"
-            sources.append({
-                "title": clean_title,
-                "date": clean_date
-            })
-        # Remove duplicates based on title and date
         unique_sources = []
         seen = set()
         for source in sources:
-            key = (source["title"], source["date"])
             if key not in seen:
                 seen.add(key)
                 unique_sources.append(source)
         return answer, unique_sources, tools_used
     def format_answer(self, answer: str) -> str:
@@ -220,21 +269,33 @@ class CustomGradioUI:
         if not answer:
             return "<div class='answer-section'><p>No answer provided.</p></div>"
         # Remove source references from the answer text for cleaner display
         answer = re.sub(r'\(Document:[^)]+\)', '', answer)
-        # Clean up extra whitespace
-        answer = re.sub(r'\s+', ' ', answer).strip()
         # Format numbered lists and bullet points
-        answer = re.sub(r'\n\s*\d+\.\s*', '<br><br><strong>', answer)  # Numbered lists
-        answer = re.sub(r'\n\s*•\s*', '<br>• ', answer)  # Bullet points
-        answer = re.sub(r'\n\s*-\s*', '<br>• ', answer)  # Dash points
         # Format bold text (markdown style)
         answer = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', answer)
-        # Format line breaks
         answer = answer.replace('\n', '<br>')
         # Clean up multiple line breaks
@@ -248,19 +309,59 @@ class CustomGradioUI:
         """
     def format_sources(self, sources: List[Dict]) -> str:
-        """Format the sources with proper HTML structure."""
         if not sources:
             return "<div><h3>📚 Sources</h3><p>No sources found.</p></div>"
         sources_html = "<div><h3>📚 Sources</h3>"
         for i, source in enumerate(sources, 1):
             sources_html += f"""
-            <div class='source-card'>
-                <div class='source-title'>{i}. {source['title']}</div>
-                <div class='source-date'>📅 {source['date']}</div>
-            </div>
             """
         sources_html += "</div>"
         return sources_html

             gr.Markdown("# 🧠 Second Brain AI Assistant")
             gr.Markdown("Ask questions about your documents and get AI-powered insights with source attribution.")
+            self.query_input = gr.Textbox(
+                label="Ask a question",
+                placeholder="What pricing objections were raised in the meetings?",
+                lines=2
+            )
+            self.submit_btn = gr.Button("Ask", variant="primary", size="lg")
             with gr.Row():
                 with gr.Column():
             # Run the agent
             result = self.agent.run(query)
+            # Parse the result with agent logs
+            agent_logs = getattr(self.agent, 'logs', []) if hasattr(self.agent, 'logs') else []
+            answer, sources, tools_used = self.parse_agent_response(result, agent_logs)
             # Debug information
+            print("\n" + "="*80)
+            print("DEBUG: RAW AGENT RESULT")
+            print("="*80)
+            print(f"Type: {type(result)}")
+            print(f"Full Content:\n{result}")
+            print("="*80)
+            print("\n" + "="*80)
+            print("DEBUG: PARSED RESULTS")
+            print("="*80)
+            print(f"Answer: {answer}")
+            print(f"Sources ({len(sources)}): {sources}")
+            print(f"Tools Used: {tools_used}")
+            print("="*80)
             # Format outputs
             answer_html = self.format_answer(answer)
             error_msg = f"<div style='color: #dc3545; padding: 12px; border: 1px solid #f5c6cb; border-radius: 4px; background-color: #f8d7da;'>Error: {str(e)}</div>"
             return error_msg, "", "", str(e)
+    def parse_agent_response(self, result: Any, agent_logs: List = None) -> Tuple[str, List[Dict], List[str]]:
         """Parse the agent response to extract answer, sources, and tools used."""
         answer = ""
         sources = []
         # Convert result to string if it's not already
         result_str = str(result)
+        # Extract the answer from the result
         # Pattern 1: JSON format with "answer" key
         json_match = re.search(r'{"answer":\s*"([^"]+)"}', result_str)
         if json_match:
                 # Pattern 3: Use the entire result as answer if no specific pattern matches
                 answer = result_str
+        # If we have agent logs, extract tools and sources from them
+        if agent_logs:
+            for step in agent_logs:
+                # Extract tool calls
+                if hasattr(step, 'tool_calls') and step.tool_calls:
+                    for tool_call in step.tool_calls:
+                        if hasattr(tool_call, 'name'):
+                            tools_used.append(tool_call.name)
+                # Extract sources from observations
+                if hasattr(step, 'observations') and step.observations:
+                    # Look for complete document blocks with all content
+                    document_pattern = r'<document id="(\d+)">\s*<title>(.*?)</title>\s*<date>(.*?)</date>\s*<contextual_summary>(.*?)</contextual_summary>\s*<marketing_insights>(.*?)</marketing_insights>\s*<content>(.*?)</content>'
+                    document_matches = re.findall(document_pattern, step.observations, re.DOTALL)
+                    for doc_id, doc_title, doc_date, contextual_summary, marketing_insights, content in document_matches:
+                        # Clean up the basic fields
+                        clean_title = doc_title.strip()
+                        clean_date = doc_date.strip()
+                        clean_summary = contextual_summary.strip()
+                        # Extract key findings from marketing insights
+                        key_findings = []
+                        key_findings_pattern = r'<key_findings>(.*?)</key_findings>'
+                        key_findings_match = re.search(key_findings_pattern, marketing_insights, re.DOTALL)
+                        if key_findings_match:
+                            key_findings_text = key_findings_match.group(1).strip()
+                            # Split by lines and clean up
+                            key_findings = [line.strip() for line in key_findings_text.split('\n') if line.strip() and line.strip().startswith('-')]
+                        # Extract quotes from marketing insights
+                        quotes = []
+                        quotes_pattern = r'<quotes>(.*?)</quotes>'
+                        quotes_match = re.search(quotes_pattern, marketing_insights, re.DOTALL)
+                        if quotes_match:
+                            quotes_text = quotes_match.group(1).strip()
+                            # Split by lines and clean up
+                            quotes = [line.strip() for line in quotes_text.split('\n') if line.strip() and line.strip().startswith('-')]
+                        sources.append({
+                            "id": doc_id,
+                            "title": clean_title,
+                            "date": clean_date,
+                            "summary": clean_summary,
+                            "key_findings": key_findings,
+                            "quotes": quotes
+                        })
+        # Fallback: Try to extract from result string if no logs provided
+        if not agent_logs:
+            # Extract tool usage from the result first
+            # Pattern 1: 🛠️ Used tool toolname
+            tool_pattern1 = r'🛠️ Used tool (\w+)'
+            tool_matches1 = re.findall(tool_pattern1, result_str)
+            # Pattern 2: Calling tool: 'toolname' (with single quotes)
+            tool_pattern2 = r"Calling tool:\s*'([^']+)'"
+            tool_matches2 = re.findall(tool_pattern2, result_str)
+            # Pattern 3: Calling tool: 'toolname' (with double quotes)
+            tool_pattern3 = r'Calling tool:\s*"([^"]+)"'
+            tool_matches3 = re.findall(tool_pattern3, result_str)
+            # Pattern 4: Calling tool: toolname (without quotes)
+            tool_pattern4 = r'Calling tool:\s*([a-zA-Z_][a-zA-Z0-9_]*)'
+            tool_matches4 = re.findall(tool_pattern4, result_str)
+            # Combine all patterns
+            all_tool_matches = tool_matches1 + tool_matches2 + tool_matches3 + tool_matches4
+            tools_used = list(set(all_tool_matches))  # Remove duplicates
+            # Extract sources from the structured search_results format
+            # Look for <document> tags in the search results
+            document_pattern = r'<document id="(\d+)">\s*<title>(.*?)</title>\s*<date>(.*?)</date>'
+            document_matches = re.findall(document_pattern, result_str, re.DOTALL)
+            for doc_id, doc_title, doc_date in document_matches:
+                # Clean up the title and date
+                clean_title = doc_title.strip()
+                clean_date = doc_date.strip()
+                sources.append({
+                    "id": doc_id,
+                    "title": clean_title,
+                    "date": clean_date
+                })
+        # Remove duplicates based on document ID (keep all unique documents)
         unique_sources = []
         seen = set()
         for source in sources:
+            # Use document ID as the unique key, fallback to title+date if no ID
+            key = source.get("id", f"{source['title']}_{source['date']}")
             if key not in seen:
                 seen.add(key)
                 unique_sources.append(source)
+        # Remove duplicate tools
+        tools_used = list(set(tools_used))
         return answer, unique_sources, tools_used
     def format_answer(self, answer: str) -> str:
         if not answer:
             return "<div class='answer-section'><p>No answer provided.</p></div>"
+        # Check if the answer is a JSON string and extract the actual answer
+        if answer.strip().startswith('{"answer":') and answer.strip().endswith('}'):
+            try:
+                import json
+                answer_data = json.loads(answer)
+                if isinstance(answer_data, dict) and 'answer' in answer_data:
+                    answer = answer_data['answer']
+            except (json.JSONDecodeError, KeyError):
+                # If JSON parsing fails, use the original answer
+                pass
         # Remove source references from the answer text for cleaner display
         answer = re.sub(r'\(Document:[^)]+\)', '', answer)
+        # Clean up extra whitespace but preserve intentional line breaks
+        answer = re.sub(r'[ \t]+', ' ', answer)  # Replace multiple spaces/tabs with single space
+        answer = re.sub(r' *\n *', '\n', answer)  # Clean up spaces around newlines
         # Format numbered lists and bullet points
+        answer = re.sub(r'\n\s*\d+\.\s*', '\n\n<strong>', answer)  # Numbered lists
+        answer = re.sub(r'\n\s*•\s*', '\n• ', answer)  # Bullet points
+        answer = re.sub(r'\n\s*-\s*', '\n• ', answer)  # Dash points
         # Format bold text (markdown style)
         answer = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', answer)
+        # Convert line breaks to HTML
         answer = answer.replace('\n', '<br>')
         # Clean up multiple line breaks
         """
     def format_sources(self, sources: List[Dict]) -> str:
+        """Format the sources with rich information including key findings and marketing insights."""
         if not sources:
             return "<div><h3>📚 Sources</h3><p>No sources found.</p></div>"
         sources_html = "<div><h3>📚 Sources</h3>"
         for i, source in enumerate(sources, 1):
+            title = source.get("title", "Unknown")
+            date = source.get("date", "Unknown")
+            doc_id = source.get("id", "")
+            summary = source.get("summary", "")
+            key_findings = source.get("key_findings", [])
+            quotes = source.get("quotes", [])
             sources_html += f"""
+            <div class='source-card' style='margin-bottom: 20px; padding: 15px; border: 1px solid #e0e0e0; border-radius: 8px; background-color: #f9f9f9;'>
+                <div class='source-title' style='font-weight: bold; font-size: 16px; margin-bottom: 8px;'>{i}. {title}</div>
+                <div class='source-meta' style='color: #666; margin-bottom: 10px;'>
+                    📅 {date}
+                    {f" | ID: {doc_id}" if doc_id else ""}
+                </div>
             """
+            if summary:
+                sources_html += f"""
+                <div class='source-summary' style='margin-bottom: 10px;'>
+                    <strong>Summary:</strong> {summary}
+                </div>
+                """
+            if key_findings:
+                sources_html += """
+                <div class='source-findings' style='margin-bottom: 10px;'>
+                    <strong>Key Findings:</strong>
+                    <ul style='margin: 5px 0; padding-left: 20px;'>
+                """
+                for finding in key_findings:
+                    clean_finding = finding.lstrip('- ').strip()
+                    sources_html += f"<li style='margin-bottom: 3px;'>{clean_finding}</li>"
+                sources_html += "</ul></div>"
+            if quotes:
+                sources_html += """
+                <div class='source-quotes' style='margin-bottom: 10px;'>
+                    <strong>Key Quotes:</strong>
+                    <ul style='margin: 5px 0; padding-left: 20px;'>
+                """
+                for quote in quotes:
+                    clean_quote = quote.lstrip('- ').strip()
+                    sources_html += f"<li style='margin-bottom: 3px; font-style: italic; color: #555;'>{clean_quote}</li>"
+                sources_html += "</ul></div>"
+            sources_html += "</div>"
         sources_html += "</div>"
         return sources_html

src/second_brain_online/config.py CHANGED Viewed

@@ -15,7 +15,7 @@ class Settings(BaseSettings):
     # --- Comet ML & Opik Configuration ---
     COMET_API_KEY: str | None = Field(
-        default=None, description="API key for Comet ML and Opik services."
     )
     COMET_PROJECT: str = Field(
         default="second_brain_course",
@@ -44,11 +44,11 @@ class Settings(BaseSettings):
         description="Name of the MongoDB database.",
     )
     MONGODB_COLLECTION_NAME: str = Field(
-        default="rag",
         description="Name of the MongoDB collection for RAG documents.",
     )
     MONGODB_URI: str = Field(
-        default="mongodb+srv://contextdb:HOqIgSH01CoEiMb1@cluster0.d9cmff.mongodb.net/",
         description="Connection URI for the MongoDB Atlas instance.",
     )

     # --- Comet ML & Opik Configuration ---
     COMET_API_KEY: str | None = Field(
+        default="yPmLa7W6QyBODw1Pnfg9jqr7E", description="API key for Comet ML and Opik services."
     )
     COMET_PROJECT: str = Field(
         default="second_brain_course",
         description="Name of the MongoDB database.",
     )
     MONGODB_COLLECTION_NAME: str = Field(
+        default="rag_insights_test",
         description="Name of the MongoDB collection for RAG documents.",
     )
     MONGODB_URI: str = Field(
+        default="mongodb+srv://keshavchhaparia:bUSBXeVCGWDyQhDG@saaslabs.awtivxf.mongodb.net/?retryWrites=true&w=majority&appName=saaslabs",
         description="Connection URI for the MongoDB Atlas instance.",
     )

tools/app.py CHANGED Viewed

@@ -3,7 +3,7 @@ from pathlib import Path
 import click
 from second_brain_online.application.agents import get_agent
-from second_brain_online.application.ui import CustomGradioUI
 @click.command()
@@ -43,7 +43,61 @@ def main(retriever_config_path: Path, ui: bool, query: str) -> None:
         result = agent.run(query)
-        print(result)
 if __name__ == "__main__":

 import click
 from second_brain_online.application.agents import get_agent
+from second_brain_online.application.ui.custom_gradio_ui import CustomGradioUI
 @click.command()
         result = agent.run(query)
+        # DEBUG: Print raw result
+        print("\n" + "="*80)
+        print("DEBUG: RAW AGENT RESULT")
+        print("="*80)
+        print(f"Type: {type(result)}")
+        print(f"Full Content:\n{result}")
+        print("="*80)
+        # DEBUG: Check agent object attributes
+        print("\n" + "="*80)
+        print("DEBUG: AGENT OBJECT ATTRIBUTES")
+        print("="*80)
+        print(f"Agent type: {type(agent)}")
+        print(f"Agent attributes: {dir(agent)}")
+        if hasattr(agent, '_AgentWrapper__agent'):
+            actual_agent = agent._AgentWrapper__agent
+            print(f"Actual agent type: {type(actual_agent)}")
+            print(f"Actual agent attributes: {dir(actual_agent)}")
+            if hasattr(actual_agent, 'conversation_history'):
+                print(f"Conversation history: {actual_agent.conversation_history}")
+            if hasattr(actual_agent, 'messages'):
+                print(f"Messages: {actual_agent.messages}")
+            if hasattr(actual_agent, 'logs'):
+                print(f"Logs: {actual_agent.logs}")
+            if hasattr(actual_agent, 'state'):
+                print(f"State: {actual_agent.state}")
+        print("="*80)
+        # Parse the result using the same logic as the UI
+        ui_instance = CustomGradioUI(None)  # We don't need the agent for parsing
+        # Get agent logs if available
+        agent_logs = []
+        if hasattr(agent, '_AgentWrapper__agent'):
+            actual_agent = agent._AgentWrapper__agent
+            if hasattr(actual_agent, 'logs'):
+                agent_logs = actual_agent.logs
+        answer, sources, tools_used = ui_instance.parse_agent_response(result, agent_logs)
+        print("\n" + "="*80)
+        print("DEBUG: PARSED RESULTS")
+        print("="*80)
+        print(f"Answer: {answer}")
+        print(f"Sources ({len(sources)}): {sources}")
+        print(f"Tools Used: {tools_used}")
+        print("="*80)
+        print("\n" + "="*80)
+        print("FINAL OUTPUT")
+        print("="*80)
+        # Format the answer for better display
+        formatted_answer = ui_instance.format_answer(answer)
+        print(formatted_answer)
 if __name__ == "__main__":