Spaces:

chinmayjha
/

context-ai

Sleeping

App Files Files Community

chinmayjha committed on Sep 24

Commit

8223f74

unverified ·

1 Parent(s): d8a714e

Enhanced conversation analysis UI with customer details and migrated to Keshav's MongoDB instance

Browse files

- Consolidated customer info (user_id, region, country, team_size) into single column
- Updated MongoDB configuration to point to Keshav's instance
- Migrated test_conversation_documents and rag_conversations collections
- Enhanced conversation table with search and filtering capabilities
- Improved UI layout with collapsible sections for sources and tools
- Added conversation analysis pipeline integration
- Updated retriever configuration for conversation data

Files changed (18) hide show

.gradio/certificate.pem +31 -0
app.py +1 -1
config.py +3 -2
compute_rag_vector_index_openai_contextual_simple.yaml → configs/compute_rag_vector_index_conversations.yaml +8 -8
configs/compute_rag_vector_index_openai_contextual_reranked.yaml +12 -0
configs/compute_rag_vector_index_openai_contextual_simple.yaml +2 -2
conversation_analysis_app.py +0 -45
correct_init.py +0 -9
init_fixed.py +0 -9
migrate_mongodb_data.py +0 -139
pyproject.toml +1 -0
src/second_brain_online/application/agents/tools/mongodb_retriever.py +4 -0
src/second_brain_online/application/rag/retrievers.py +35 -6
src/second_brain_online/application/ui/custom_gradio_ui.py +383 -13
src/second_brain_online/config.py +3 -2
temp_init.py +0 -9
uv.lock +2 -0
what_can_i_do.py +0 -60

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from second_brain_online import opik_utils
 def main():
     """Main function for Hugging Face Space deployment."""
     # Set default values for HF Spaces
-    retriever_config_path = os.getenv("RETRIEVER_CONFIG_PATH", "configs/compute_rag_vector_index_openai_contextual_simple.yaml")
     print("🚀 Starting Second Brain AI Assistant...")
     print(f"📁 Using retriever config: {retriever_config_path}")

 def main():
     """Main function for Hugging Face Space deployment."""
     # Set default values for HF Spaces
+    retriever_config_path = os.getenv("RETRIEVER_CONFIG_PATH", "configs/compute_rag_vector_index_conversations.yaml")
     print("🚀 Starting Second Brain AI Assistant...")
     print(f"📁 Using retriever config: {retriever_config_path}")

config.py CHANGED Viewed

@@ -44,11 +44,12 @@ class Settings(BaseSettings):
         description="Name of the MongoDB database.",
     )
     MONGODB_COLLECTION_NAME: str = Field(
-        default="rag_insights_test",
         description="Name of the MongoDB collection for RAG documents.",
     )
     MONGODB_URI: str = Field(
-        default="mongodb+srv://keshavchhaparia:bUSBXeVCGWDyQhDG@saaslabs.awtivxf.mongodb.net/?retryWrites=true&w=majority&appName=saaslabs",
         description="Connection URI for the MongoDB Atlas instance.",
     )

         description="Name of the MongoDB database.",
     )
     MONGODB_COLLECTION_NAME: str = Field(
+        default="rag_conversations",
         description="Name of the MongoDB collection for RAG documents.",
     )
     MONGODB_URI: str = Field(
+        default="mongodb+srv://contextdb:HOqIgSH01CoEiMb1@cluster0.d9cmff.mongodb.net/",
+        # default="mongodb+srv://keshavchhaparia:bUSBXeVCGWDyQhDG@saaslabs.awtivxf.mongodb.net/?retryWrites=true&w=majority&appName=saaslabs",
         description="Connection URI for the MongoDB Atlas instance.",
     )

compute_rag_vector_index_openai_contextual_simple.yaml → configs/compute_rag_vector_index_conversations.yaml RENAMED Viewed

@@ -1,17 +1,17 @@
 parameters:
-  extract_collection_name: raw
-  fetch_limit: 200
-  load_collection_name: rag_insights_test
-  content_quality_score_threshold: 0.6
   retriever_type: contextual
   embedding_model_id: text-embedding-3-small
   embedding_model_type: openai
   embedding_model_dim: 1536
   chunk_size: 640
   contextual_summarization_type: contextual
-  contextual_agent_model_id: gpt-4o
-  contextual_agent_max_characters: 128
   mock: false
-  processing_batch_size: 2
   processing_max_workers: 2
-  device: mps # or cuda (for Nvidia GPUs) or mps (for Apple M1/M2/M3 chips)

 parameters:
+  extract_collection_name: test_conversation_documents
+  fetch_limit: 0  # No limit - get all conversations
+  load_collection_name: rag_conversations
+  content_quality_score_threshold: 0.0
   retriever_type: contextual
   embedding_model_id: text-embedding-3-small
   embedding_model_type: openai
   embedding_model_dim: 1536
   chunk_size: 640
   contextual_summarization_type: contextual
+  contextual_agent_model_id: gpt-4o-mini
+  contextual_agent_max_characters: 200
   mock: false
+  processing_batch_size: 5
   processing_max_workers: 2
+  device: mps  # mps for Apple M1/M2/M3 chips; use cuda for Nvidia GPUs

configs/compute_rag_vector_index_openai_contextual_reranked.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+parameters:
+  retriever_type: contextual_reranked  # Enable re-ranking
+  embedding_model_id: text-embedding-3-small
+  embedding_model_type: openai
+  embedding_model_dim: 1536
+  device: mps  # mps for Apple M1/M2/M3 chips; use cuda for Nvidia GPUs
+  # Re-ranking parameters
+  enable_reranking: true
+  rerank_model_name: "cross-encoder/ms-marco-MiniLM-L-2-v2"
+  stage1_limit: 50  # Retrieve 50 candidates in stage 1
+  final_k: 10  # Return top 10 after re-ranking

configs/compute_rag_vector_index_openai_contextual_simple.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 parameters:
-  extract_collection_name: raw
   fetch_limit: 200
-  load_collection_name: rag_insights_test
   content_quality_score_threshold: 0.6
   retriever_type: contextual
   embedding_model_id: text-embedding-3-small

 parameters:
+  extract_collection_name: test_intercom_data
   fetch_limit: 200
+  load_collection_name: rag_intercom
   content_quality_score_threshold: 0.6
   retriever_type: contextual
   embedding_model_id: text-embedding-3-small

conversation_analysis_app.py DELETED Viewed

@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-"""
-Hugging Face Space app for Conversation Analysis Dashboard.
-This app displays conversation analysis results in a tabular format,
-showing insights, summaries, and follow-up emails for all conversations
-from the test_intercom_data collection.
-"""
-import os
-import sys
-from pathlib import Path
-# Add paths
-sys.path.append('.')
-sys.path.append('src')
-from second_brain_online.application.ui.conversation_analysis_ui import ConversationAnalysisUI
-def main():
-    """Main function for HF Space deployment."""
-    print("🚀 Starting Conversation Analysis Dashboard...")
-    print("📊 Loading conversation analysis data from MongoDB...")
-    try:
-        # Initialize UI
-        ui = ConversationAnalysisUI()
-        print("✅ UI initialized successfully")
-        print("🌐 Launching Gradio interface...")
-        # Launch the interface
-        ui.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=True,
-            show_error=True
-        )
-    except Exception as e:
-        print(f"❌ Error starting the application: {e}")
-        raise
-if __name__ == "__main__":
-    main()

correct_init.py DELETED Viewed

@@ -1,9 +0,0 @@
-from . import agents, rag
-# Optional import for evaluation - may cause issues in some environments
-try:
-    from .evaluation import evaluate
-    __all__ = ["rag", "agents", "evaluate"]
-except ImportError as e:
-    print(f"Warning: Could not import evaluation module: {e}")
-    __all__ = ["rag", "agents"]

init_fixed.py DELETED Viewed

@@ -1,9 +0,0 @@
-from . import agents, rag
-# Optional import for evaluation - may cause issues in some environments
-try:
-    from .evaluation import evaluate
-    __all__ = ["rag", "agents", "evaluate"]
-except ImportError as e:
-    print(f"Warning: Could not import evaluation module: {e}")
-    __all__ = ["rag", "agents"]

migrate_mongodb_data.py DELETED Viewed

@@ -1,139 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script to migrate test_intercom_data from contextdb instance to keshavchhaparia instance.
-"""
-import sys
-from pymongo import MongoClient
-from loguru import logger
-# Source MongoDB (contextdb instance)
-SOURCE_URI = "mongodb+srv://contextdb:HOqIgSH01CoEiMb1@cluster0.d9cmff.mongodb.net/"
-SOURCE_DB = "second_brain_course"
-SOURCE_COLLECTION = "test_intercom_data"
-# Target MongoDB (keshavchhaparia instance)
-TARGET_URI = "mongodb+srv://keshavchhaparia:bUSBXeVCGWDyQhDG@saaslabs.awtivxf.mongodb.net/"
-TARGET_DB = "second_brain_course"
-TARGET_COLLECTION = "test_intercom_data"
-def migrate_data():
-    """Migrate test_intercom_data collection from source to target MongoDB."""
-    logger.info("🚀 Starting MongoDB data migration...")
-    # Connect to source MongoDB
-    logger.info(f"📡 Connecting to source MongoDB: {SOURCE_URI}")
-    try:
-        source_client = MongoClient(SOURCE_URI)
-        source_db = source_client[SOURCE_DB]
-        source_collection = source_db[SOURCE_COLLECTION]
-        logger.info("✅ Connected to source MongoDB")
-    except Exception as e:
-        logger.error(f"❌ Failed to connect to source MongoDB: {e}")
-        return False
-    # Connect to target MongoDB
-    logger.info(f"📡 Connecting to target MongoDB: {TARGET_URI}")
-    try:
-        target_client = MongoClient(TARGET_URI)
-        target_db = target_client[TARGET_DB]
-        target_collection = target_db[TARGET_COLLECTION]
-        logger.info("✅ Connected to target MongoDB")
-    except Exception as e:
-        logger.error(f"❌ Failed to connect to target MongoDB: {e}")
-        return False
-    try:
-        # Get document count from source
-        source_count = source_collection.count_documents({})
-        logger.info(f"📊 Source collection has {source_count} documents")
-        if source_count == 0:
-            logger.warning("⚠️ Source collection is empty, nothing to migrate")
-            return True
-        # Delete existing target collection
-        logger.info(f"🗑️ Deleting existing target collection: {TARGET_COLLECTION}")
-        target_collection.drop()
-        logger.info("✅ Target collection deleted")
-        # Copy documents from source to target
-        logger.info("📋 Copying documents from source to target...")
-        # Process in batches to avoid memory issues
-        batch_size = 100
-        total_copied = 0
-        for skip in range(0, source_count, batch_size):
-            # Get batch of documents
-            documents = list(source_collection.find().skip(skip).limit(batch_size))
-            if documents:
-                # Insert batch into target
-                target_collection.insert_many(documents)
-                total_copied += len(documents)
-                logger.info(f"📦 Copied batch: {len(documents)} documents (Total: {total_copied}/{source_count})")
-        # Verify migration
-        target_count = target_collection.count_documents({})
-        logger.info(f"✅ Migration completed! Target collection has {target_count} documents")
-        if target_count == source_count:
-            logger.info("🎉 Migration successful - document counts match!")
-            return True
-        else:
-            logger.error(f"❌ Migration failed - document count mismatch: {target_count} vs {source_count}")
-            return False
-    except Exception as e:
-        logger.error(f"❌ Migration failed: {e}")
-        return False
-    finally:
-        # Close connections
-        source_client.close()
-        target_client.close()
-        logger.info("🔌 MongoDB connections closed")
-def verify_migration():
-    """Verify the migration was successful."""
-    logger.info("🔍 Verifying migration...")
-    try:
-        # Connect to target MongoDB
-        target_client = MongoClient(TARGET_URI)
-        target_db = target_client[TARGET_DB]
-        target_collection = target_db[TARGET_COLLECTION]
-        # Get sample documents
-        sample_docs = list(target_collection.find().limit(3))
-        logger.info(f"📋 Sample documents in target collection:")
-        for i, doc in enumerate(sample_docs, 1):
-            conversation_id = doc.get('metadata', {}).get('properties', {}).get('conversation_id', 'N/A')
-            has_analysis = 'conversation_analysis' in doc
-            quality_score = doc.get('content_quality_score', 'N/A')
-            logger.info(f"  {i}. Conversation ID: {conversation_id}, Has Analysis: {has_analysis}, Quality: {quality_score}")
-        target_client.close()
-        logger.info("✅ Verification completed")
-    except Exception as e:
-        logger.error(f"❌ Verification failed: {e}")
-if __name__ == "__main__":
-    logger.info("=" * 60)
-    logger.info("🔄 MongoDB Data Migration Script")
-    logger.info("=" * 60)
-    # Run migration
-    success = migrate_data()
-    if success:
-        # Verify migration
-        verify_migration()
-        logger.info("🎉 Migration completed successfully!")
-    else:
-        logger.error("❌ Migration failed!")
-        sys.exit(1)

pyproject.toml CHANGED Viewed

@@ -26,6 +26,7 @@ dependencies = [
     "comet_ml>=3.47.6",
     "langchain-huggingface>=0.1.2",
     "huggingface-hub>=0.27.1",
 ]
 [dependency-groups]

     "comet_ml>=3.47.6",
     "langchain-huggingface>=0.1.2",
     "huggingface-hub>=0.27.1",
+    "sentence-transformers>=3.0.0",
 ]
 [dependency-groups]

src/second_brain_online/application/agents/tools/mongodb_retriever.py CHANGED Viewed

@@ -44,6 +44,10 @@ class MongoDBRetrieverTool(Tool):
             retriever_type=config["retriever_type"],
             k=5,
             device=config["device"],
         )
     @track(name="MongoDBRetrieverTool.forward")

             retriever_type=config["retriever_type"],
             k=5,
             device=config["device"],
+            enable_reranking=config.get("enable_reranking", False),
+            rerank_model_name=config.get("rerank_model_name", "cross-encoder/ms-marco-MiniLM-L-2-v2"),
+            stage1_limit=config.get("stage1_limit", 50),
+            final_k=config.get("final_k", 10),
         )
     @track(name="MongoDBRetrieverTool.forward")

src/second_brain_online/application/rag/retrievers.py CHANGED Viewed

@@ -13,9 +13,11 @@ from .embeddings import EmbeddingModelType, EmbeddingsModel, get_embedding_model
 from .splitters import get_splitter
 # Add these type definitions at the top of the file
-RetrieverType = Literal["contextual", "parent"]
 RetrieverModel = Union[
-    MongoDBAtlasHybridSearchRetriever, MongoDBAtlasParentDocumentRetriever
 ]
@@ -25,6 +27,10 @@ def get_retriever(
     retriever_type: RetrieverType = "contextual",
     k: int = 3,
     device: str = "cpu",
 ) -> RetrieverModel:
     logger.info(
         f"Getting '{retriever_type}' retriever for '{embedding_model_type}' - '{embedding_model_id}' on '{device}' "
@@ -35,13 +41,36 @@ def get_retriever(
         embedding_model_id, embedding_model_type, device
     )
-    if retriever_type == "contextual":
-        return get_hybrid_search_retriever(embedding_model, k)
-    elif retriever_type == "parent":
-        return get_parent_document_retriever(embedding_model, k)
     else:
         raise ValueError(f"Invalid retriever type: {retriever_type}")
 def get_hybrid_search_retriever(
     embedding_model: EmbeddingsModel, k: int

 from .splitters import get_splitter
 # Add these type definitions at the top of the file
+RetrieverType = Literal["contextual", "parent", "contextual_reranked", "parent_reranked"]
 RetrieverModel = Union[
+    MongoDBAtlasHybridSearchRetriever,
+    MongoDBAtlasParentDocumentRetriever,
+    "RerankingRetriever"
 ]
     retriever_type: RetrieverType = "contextual",
     k: int = 3,
     device: str = "cpu",
+    enable_reranking: bool = False,
+    rerank_model_name: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
+    stage1_limit: int = 50,
+    final_k: int = 10,
 ) -> RetrieverModel:
     logger.info(
         f"Getting '{retriever_type}' retriever for '{embedding_model_type}' - '{embedding_model_id}' on '{device}' "
         embedding_model_id, embedding_model_type, device
     )
+    # Determine base retriever type
+    base_retriever_type = retriever_type
+    if retriever_type in ["contextual_reranked", "parent_reranked"]:
+        base_retriever_type = retriever_type.replace("_reranked", "")
+        enable_reranking = True
+    else:
+        enable_reranking = enable_reranking
+    # Create base retriever
+    if base_retriever_type == "contextual":
+        base_retriever = get_hybrid_search_retriever(embedding_model, k)
+    elif base_retriever_type == "parent":
+        base_retriever = get_parent_document_retriever(embedding_model, k)
     else:
         raise ValueError(f"Invalid retriever type: {retriever_type}")
+    # Wrap with re-ranking if enabled
+    if enable_reranking:
+        from second_brain_offline.application.rag.reranker import RerankingRetriever
+        logger.info(f"Enabling re-ranking with model: {rerank_model_name}")
+        logger.info(f"Stage 1 limit: {stage1_limit}, Final k: {final_k}")
+        return RerankingRetriever(
+            base_retriever=base_retriever,
+            rerank_model_name=rerank_model_name,
+            stage1_limit=stage1_limit,
+            final_k=final_k
+        )
+    return base_retriever
 def get_hybrid_search_retriever(
     embedding_model: EmbeddingsModel, k: int

src/second_brain_online/application/ui/custom_gradio_ui.py CHANGED Viewed

@@ -1,18 +1,40 @@
 import json
 import re
-from typing import Any, Dict, List, Tuple
 import gradio as gr
 from smolagents import ToolCallingAgent
 class CustomGradioUI:
     """Custom Gradio UI for better formatting of agent responses with source attribution."""
     def __init__(self, agent: ToolCallingAgent):
         self.agent = agent
         self.setup_ui()
     def setup_ui(self):
         """Setup the Gradio interface with custom components."""
         with gr.Blocks(
@@ -68,8 +90,31 @@ class CustomGradioUI:
             with gr.Row():
                 with gr.Column():
                     self.answer_output = gr.HTML(label="Answer")
-                    self.sources_output = gr.HTML(label="Sources")
-                    self.tools_output = gr.HTML(label="Tools Used")
             with gr.Accordion("🔍 Debug: Raw Response", open=False):
                 self.debug_output = gr.Textbox(
@@ -83,19 +128,33 @@ class CustomGradioUI:
             self.submit_btn.click(
                 fn=self.process_query,
                 inputs=[self.query_input],
-                outputs=[self.answer_output, self.sources_output, self.tools_output, self.debug_output]
             )
             self.query_input.submit(
                 fn=self.process_query,
                 inputs=[self.query_input],
-                outputs=[self.answer_output, self.sources_output, self.tools_output, self.debug_output]
             )
-    def process_query(self, query: str) -> Tuple[str, str, str, str]:
         """Process the user query and return formatted response components."""
         if not query.strip():
-            return "", "", "", ""
         try:
             # Run the agent
@@ -127,11 +186,14 @@ class CustomGradioUI:
             tools_html = self.format_tools(tools_used)
             debug_text = str(result)
-            return answer_html, sources_html, tools_html, debug_text
         except Exception as e:
             error_msg = f"<div style='color: #dc3545; padding: 12px; border: 1px solid #f5c6cb; border-radius: 4px; background-color: #f8d7da;'>Error: {str(e)}</div>"
-            return error_msg, "", "", str(e)
     def parse_agent_response(self, result: Any, agent_logs: List = None) -> Tuple[str, List[Dict], List[str]]:
         """Parse the agent response to extract answer, sources, and tools used."""
@@ -173,10 +235,14 @@ class CustomGradioUI:
                 # Extract sources from observations
                 if hasattr(step, 'observations') and step.observations:
                     # Look for complete document blocks with all content
                     document_pattern = r'<document id="(\d+)">\s*<title>(.*?)</title>\s*<date>(.*?)</date>\s*<contextual_summary>(.*?)</contextual_summary>\s*<marketing_insights>(.*?)</marketing_insights>\s*<content>(.*?)</content>'
                     document_matches = re.findall(document_pattern, step.observations, re.DOTALL)
                     for doc_id, doc_title, doc_date, contextual_summary, marketing_insights, content in document_matches:
                         # Clean up the basic fields
                         clean_title = doc_title.strip()
@@ -209,6 +275,40 @@ class CustomGradioUI:
                             "key_findings": key_findings,
                             "quotes": quotes
                         })
         # Fallback: Try to extract from result string if no logs provided
         if not agent_logs:
@@ -311,9 +411,9 @@ class CustomGradioUI:
     def format_sources(self, sources: List[Dict]) -> str:
         """Format the sources with rich information including key findings and marketing insights."""
         if not sources:
-            return "<div><h3>📚 Sources</h3><p>No sources found.</p></div>"
-        sources_html = "<div><h3>📚 Sources</h3>"
         for i, source in enumerate(sources, 1):
             title = source.get("title", "Unknown")
@@ -369,9 +469,9 @@ class CustomGradioUI:
     def format_tools(self, tools_used: List[str]) -> str:
         """Format the tools used with proper HTML structure."""
         if not tools_used:
-            return "<div><h3>🛠️ Tools Used</h3><p>No tools used.</p></div>"
-        tools_html = "<div><h3>🛠️ Tools Used</h3>"
         for tool in tools_used:
             tools_html += f"""
@@ -383,6 +483,276 @@ class CustomGradioUI:
         tools_html += "</div>"
         return tools_html
     def launch(self, **kwargs):
         """Launch the Gradio interface."""
         return self.interface.launch(**kwargs)

 import json
 import re
+from typing import Any, Dict, List, Tuple, Optional
+from datetime import datetime
 import gradio as gr
+import pandas as pd
+from pymongo import MongoClient
 from smolagents import ToolCallingAgent
+from second_brain_online.config import settings
 class CustomGradioUI:
     """Custom Gradio UI for better formatting of agent responses with source attribution."""
     def __init__(self, agent: ToolCallingAgent):
         self.agent = agent
+        self.mongodb_client = None
+        self.database = None
+        self.conversation_collection = None
+        self.setup_mongodb()
         self.setup_ui()
+    def setup_mongodb(self):
+        """Setup MongoDB connection."""
+        try:
+            self.mongodb_client = MongoClient(settings.MONGODB_URI)
+            self.database = self.mongodb_client[settings.MONGODB_DATABASE_NAME]
+            self.conversation_collection = self.database["test_conversation_documents"]
+            print("✅ MongoDB connection established successfully")
+        except Exception as e:
+            print(f"❌ Failed to connect to MongoDB: {e}")
+            self.mongodb_client = None
+            self.database = None
+            self.conversation_collection = None
     def setup_ui(self):
         """Setup the Gradio interface with custom components."""
         with gr.Blocks(
             with gr.Row():
                 with gr.Column():
                     self.answer_output = gr.HTML(label="Answer")
+            with gr.Accordion("📊 Conversations", open=False):
+                with gr.Row():
+                    self.conversation_search = gr.Textbox(
+                        label="Search Conversations",
+                        placeholder="Search by conversation ID, customer info, summary, or key findings...",
+                        scale=4
+                    )
+                    self.clear_search_btn = gr.Button("Clear", scale=1)
+                self.conversation_table = gr.Dataframe(
+                    headers=["Conversation ID", "Customer Info", "Summary", "Key Findings", "Follow-up Email"],
+                    datatype=["str", "str", "str", "str", "str"],
+                    interactive=False,
+                    label="Available Conversations",
+                    wrap=True,
+                    max_height=400,
+                    value=self.load_conversations()
+                )
+            with gr.Accordion("📚 Sources", open=False):
+                self.sources_output = gr.HTML(label="Sources")
+            with gr.Accordion("🛠️ Tools Used", open=False):
+                self.tools_output = gr.HTML(label="Tools Used")
             with gr.Accordion("🔍 Debug: Raw Response", open=False):
                 self.debug_output = gr.Textbox(
             self.submit_btn.click(
                 fn=self.process_query,
                 inputs=[self.query_input],
+                outputs=[self.answer_output, self.sources_output, self.tools_output, self.debug_output, self.conversation_table]
             )
             self.query_input.submit(
                 fn=self.process_query,
                 inputs=[self.query_input],
+                outputs=[self.answer_output, self.sources_output, self.tools_output, self.debug_output, self.conversation_table]
+            )
+            # Conversation search handlers
+            self.conversation_search.change(
+                fn=self.filter_conversations,
+                inputs=[self.conversation_search],
+                outputs=[self.conversation_table]
+            )
+            self.clear_search_btn.click(
+                fn=self.clear_conversation_search,
+                inputs=[],
+                outputs=[self.conversation_search, self.conversation_table]
             )
+    def process_query(self, query: str) -> Tuple[str, str, str, str, pd.DataFrame]:
         """Process the user query and return formatted response components."""
         if not query.strip():
+            # Clear all outputs when query is empty
+            return "", "", "", "", self.load_conversations()
         try:
             # Run the agent
             tools_html = self.format_tools(tools_used)
             debug_text = str(result)
+            # Filter conversations based on sources used
+            filtered_conversations = self.filter_conversations_by_sources(sources)
+            return answer_html, sources_html, tools_html, debug_text, filtered_conversations
         except Exception as e:
             error_msg = f"<div style='color: #dc3545; padding: 12px; border: 1px solid #f5c6cb; border-radius: 4px; background-color: #f8d7da;'>Error: {str(e)}</div>"
+            return error_msg, "", "", str(e), self.load_conversations()
     def parse_agent_response(self, result: Any, agent_logs: List = None) -> Tuple[str, List[Dict], List[str]]:
         """Parse the agent response to extract answer, sources, and tools used."""
                 # Extract sources from observations
                 if hasattr(step, 'observations') and step.observations:
+                    print(f"DEBUG: Processing observations: {step.observations[:500]}...")
                     # Look for complete document blocks with all content
                     document_pattern = r'<document id="(\d+)">\s*<title>(.*?)</title>\s*<date>(.*?)</date>\s*<contextual_summary>(.*?)</contextual_summary>\s*<marketing_insights>(.*?)</marketing_insights>\s*<content>(.*?)</content>'
                     document_matches = re.findall(document_pattern, step.observations, re.DOTALL)
+                    print(f"DEBUG: Found {len(document_matches)} document matches with full pattern")
                     for doc_id, doc_title, doc_date, contextual_summary, marketing_insights, content in document_matches:
                         # Clean up the basic fields
                         clean_title = doc_title.strip()
                             "key_findings": key_findings,
                             "quotes": quotes
                         })
+                    # Fallback: Look for simpler document patterns if the full pattern didn't match
+                    if not document_matches:
+                        print("DEBUG: Trying fallback document patterns...")
+                        # Pattern 1: Simple document with ID and title
+                        simple_pattern = r'<document id="(\d+)">\s*<title>(.*?)</title>'
+                        simple_matches = re.findall(simple_pattern, step.observations, re.DOTALL)
+                        print(f"DEBUG: Found {len(simple_matches)} simple document matches")
+                        for doc_id, doc_title in simple_matches:
+                            sources.append({
+                                "id": doc_id,
+                                "title": doc_title.strip(),
+                                "date": "",
+                                "summary": "",
+                                "key_findings": [],
+                                "quotes": []
+                            })
+                        # Pattern 2: Look for conversation IDs in the content
+                        conv_id_pattern = r'conversation[_\s]*id[:\s]*(\d+)'
+                        conv_id_matches = re.findall(conv_id_pattern, step.observations, re.IGNORECASE)
+                        print(f"DEBUG: Found {len(conv_id_matches)} conversation ID matches: {conv_id_matches}")
+                        for conv_id in conv_id_matches:
+                            sources.append({
+                                "id": conv_id,
+                                "title": f"Conversation {conv_id}",
+                                "date": "",
+                                "summary": "",
+                                "key_findings": [],
+                                "quotes": []
+                            })
         # Fallback: Try to extract from result string if no logs provided
         if not agent_logs:
     def format_sources(self, sources: List[Dict]) -> str:
         """Format the sources with rich information including key findings and marketing insights."""
         if not sources:
+            return "<div><p>No sources found.</p></div>"
+        sources_html = "<div>"
         for i, source in enumerate(sources, 1):
             title = source.get("title", "Unknown")
     def format_tools(self, tools_used: List[str]) -> str:
         """Format the tools used with proper HTML structure."""
         if not tools_used:
+            return "<div><p>No tools used.</p></div>"
+        tools_html = "<div>"
         for tool in tools_used:
             tools_html += f"""
         tools_html += "</div>"
         return tools_html
+    def load_conversations(self, limit: int = 50) -> pd.DataFrame:
+        """Load conversations from MongoDB and format for display."""
+        if self.conversation_collection is None:
+            return pd.DataFrame(columns=["Conversation ID", "Customer Info", "Summary", "Key Findings", "Follow-up Email"])
+        try:
+            # Query for documents with conversation_analysis
+            pipeline = [
+                {"$match": {"conversation_analysis": {"$exists": True}}},
+                {"$limit": limit},
+                {"$project": {
+                    "conversation_id": "$metadata.properties.conversation_id",
+                    "user_id": "$metadata.properties.user_id",
+                    "icp_region": "$metadata.properties.icp_region",
+                    "icp_country": "$metadata.properties.icp_country",
+                    "team_size": "$metadata.properties.team_size",
+                    "summary": "$conversation_analysis.aggregated_contextual_summary",
+                    "key_findings": "$conversation_analysis.aggregated_marketing_insights.key_findings",
+                    "follow_up_email": "$conversation_analysis.follow_up_email"
+                }}
+            ]
+            docs = list(self.conversation_collection.aggregate(pipeline))
+            data = []
+            for doc in docs:
+                conversation_id = doc.get("conversation_id", "Unknown")
+                user_id = doc.get("user_id", "N/A")
+                icp_region = doc.get("icp_region", "N/A")
+                icp_country = doc.get("icp_country", "N/A")
+                team_size = doc.get("team_size", "N/A")
+                summary = doc.get("summary", "No summary available")
+                follow_up_email = doc.get("follow_up_email", "No follow-up email available")
+                # Format customer info into a single column
+                customer_info_parts = []
+                if user_id != "N/A":
+                    customer_info_parts.append(f"User: {user_id}")
+                if icp_region != "N/A":
+                    customer_info_parts.append(f"Region: {icp_region}")
+                if icp_country != "N/A":
+                    customer_info_parts.append(f"Country: {icp_country}")
+                if team_size != "N/A":
+                    customer_info_parts.append(f"Team Size: {team_size}")
+                customer_info = "\n".join(customer_info_parts) if customer_info_parts else "No customer info available"
+                # Format key findings
+                key_findings = doc.get("key_findings", [])
+                if key_findings and isinstance(key_findings, list):
+                    findings_text = "\n".join([f"• {finding.get('finding', '')}" for finding in key_findings[:3]])  # Limit to 3 findings
+                    if len(key_findings) > 3:
+                        findings_text += f"\n... and {len(key_findings) - 3} more"
+                else:
+                    findings_text = "No key findings available"
+                # Truncate summary for table display
+                if len(summary) > 200:
+                    summary = summary[:200] + "..."
+                # Truncate follow-up email for table display
+                if len(follow_up_email) > 150:
+                    follow_up_email = follow_up_email[:150] + "..."
+                data.append({
+                    "Conversation ID": conversation_id,
+                    "Customer Info": customer_info,
+                    "Summary": summary,
+                    "Key Findings": findings_text,
+                    "Follow-up Email": follow_up_email
+                })
+            return pd.DataFrame(data)
+        except Exception as e:
+            print(f"Error loading conversations: {e}")
+            return pd.DataFrame(columns=["Conversation ID", "Customer Info", "Summary", "Key Findings", "Follow-up Email"])
+    def filter_conversations_by_sources(self, sources: List[Dict]) -> pd.DataFrame:
+        """Filter conversations to show only those used in the current query."""
+        if not sources or self.conversation_collection is None:
+            return self.load_conversations()
+        try:
+            # Extract conversation IDs from sources
+            source_conversation_ids = set()
+            print(f"DEBUG: Filtering conversations based on {len(sources)} sources")
+            for source in sources:
+                print(f"DEBUG: Processing source: {source}")
+                # Try to extract conversation ID from various possible fields
+                doc_id = source.get("id", "")
+                title = source.get("title", "")
+                # Method 1: Try to extract conversation ID from title (if it contains conversation ID)
+                if title and "conversation" in title.lower():
+                    # Look for conversation ID pattern in title
+                    import re
+                    conv_id_match = re.search(r'conversation[_\s]*(\d+)', title, re.IGNORECASE)
+                    if conv_id_match:
+                        conv_id = conv_id_match.group(1)
+                        source_conversation_ids.add(conv_id)
+                        print(f"DEBUG: Found conversation ID from title: {conv_id}")
+                        continue
+                # Method 2: Query the RAG collection to find the conversation ID for this document
+                if doc_id:
+                    try:
+                        # Use the correct collection name for RAG data
+                        rag_collection = self.database["rag_conversations"]
+                        # Try different query patterns
+                        doc = None
+                        # Try by _id if it's a valid ObjectId
+                        if doc_id.isdigit():
+                            doc = rag_collection.find_one({"_id": int(doc_id)})
+                        if not doc:
+                            # Try by properties.conversation_id
+                            doc = rag_collection.find_one({"properties.conversation_id": doc_id})
+                        if not doc:
+                            # Try by conversation_id in properties
+                            doc = rag_collection.find_one({"properties.conversation_id": str(doc_id)})
+                        if doc and "properties" in doc and "conversation_id" in doc["properties"]:
+                            conv_id = doc["properties"]["conversation_id"]
+                            if conv_id:
+                                source_conversation_ids.add(str(conv_id))
+                                print(f"DEBUG: Found conversation ID from RAG query: {conv_id}")
+                        else:
+                            print(f"DEBUG: No conversation ID found for doc_id: {doc_id}")
+                    except Exception as e:
+                        print(f"DEBUG: Error querying RAG collection for doc_id {doc_id}: {e}")
+            print(f"DEBUG: Found {len(source_conversation_ids)} unique conversation IDs: {source_conversation_ids}")
+            if not source_conversation_ids:
+                print("DEBUG: No conversation IDs found, returning all conversations")
+                return self.load_conversations()
+            # Query for conversations that match the source conversation IDs
+            pipeline = [
+                {"$match": {
+                    "conversation_analysis": {"$exists": True},
+                    "metadata.properties.conversation_id": {"$in": list(source_conversation_ids)}
+                }},
+                {"$project": {
+                    "conversation_id": "$metadata.properties.conversation_id",
+                    "user_id": "$metadata.properties.user_id",
+                    "icp_region": "$metadata.properties.icp_region",
+                    "icp_country": "$metadata.properties.icp_country",
+                    "team_size": "$metadata.properties.team_size",
+                    "summary": "$conversation_analysis.aggregated_contextual_summary",
+                    "key_findings": "$conversation_analysis.aggregated_marketing_insights.key_findings",
+                    "follow_up_email": "$conversation_analysis.follow_up_email"
+                }}
+            ]
+            docs = list(self.conversation_collection.aggregate(pipeline))
+            print(f"DEBUG: Found {len(docs)} matching conversation documents")
+            data = []
+            for doc in docs:
+                conversation_id = doc.get("conversation_id", "Unknown")
+                user_id = doc.get("user_id", "N/A")
+                icp_region = doc.get("icp_region", "N/A")
+                icp_country = doc.get("icp_country", "N/A")
+                team_size = doc.get("team_size", "N/A")
+                summary = doc.get("summary", "No summary available")
+                follow_up_email = doc.get("follow_up_email", "No follow-up email available")
+                # Format customer info into a single column
+                customer_info_parts = []
+                if user_id != "N/A":
+                    customer_info_parts.append(f"User: {user_id}")
+                if icp_region != "N/A":
+                    customer_info_parts.append(f"Region: {icp_region}")
+                if icp_country != "N/A":
+                    customer_info_parts.append(f"Country: {icp_country}")
+                if team_size != "N/A":
+                    customer_info_parts.append(f"Team Size: {team_size}")
+                customer_info = "\n".join(customer_info_parts) if customer_info_parts else "No customer info available"
+                # Format key findings
+                key_findings = doc.get("key_findings", [])
+                if key_findings and isinstance(key_findings, list):
+                    findings_text = "\n".join([f"• {finding.get('finding', '')}" for finding in key_findings[:3]])
+                    if len(key_findings) > 3:
+                        findings_text += f"\n... and {len(key_findings) - 3} more"
+                else:
+                    findings_text = "No key findings available"
+                # Truncate summary for table display
+                if len(summary) > 200:
+                    summary = summary[:200] + "..."
+                # Truncate follow-up email for table display
+                if len(follow_up_email) > 150:
+                    follow_up_email = follow_up_email[:150] + "..."
+                data.append({
+                    "Conversation ID": conversation_id,
+                    "Customer Info": customer_info,
+                    "Summary": summary,
+                    "Key Findings": findings_text,
+                    "Follow-up Email": follow_up_email
+                })
+            print(f"DEBUG: Returning {len(data)} filtered conversations")
+            return pd.DataFrame(data)
+        except Exception as e:
+            print(f"Error filtering conversations: {e}")
+            import traceback
+            traceback.print_exc()
+            return self.load_conversations()
+    def filter_conversations(self, search_term: str) -> pd.DataFrame:
+        """Filter conversations based on search term."""
+        if not search_term or not search_term.strip():
+            return self.load_conversations()
+        try:
+            # Load all conversations first
+            all_conversations = self.load_conversations(limit=1000)  # Load more for filtering
+            if all_conversations.empty:
+                return all_conversations
+            # Convert search term to lowercase for case-insensitive search
+            search_lower = search_term.lower().strip()
+            # Filter conversations based on search term
+            filtered_data = []
+            for _, row in all_conversations.iterrows():
+                # Search in conversation ID, customer info, summary, key findings, and follow-up email
+                conversation_id = str(row.get("Conversation ID", "")).lower()
+                customer_info = str(row.get("Customer Info", "")).lower()
+                summary = str(row.get("Summary", "")).lower()
+                key_findings = str(row.get("Key Findings", "")).lower()
+                follow_up_email = str(row.get("Follow-up Email", "")).lower()
+                # Check if search term matches any field
+                if (search_lower in conversation_id or
+                    search_lower in customer_info or
+                    search_lower in summary or
+                    search_lower in key_findings or
+                    search_lower in follow_up_email):
+                    filtered_data.append(row.to_dict())
+            return pd.DataFrame(filtered_data)
+        except Exception as e:
+            print(f"Error filtering conversations: {e}")
+            return self.load_conversations()
+    def clear_conversation_search(self) -> Tuple[str, pd.DataFrame]:
+        """Clear the search and show all conversations."""
+        return "", self.load_conversations()
+    def reset_ui_state(self) -> Tuple[str, str, str, str, pd.DataFrame]:
+        """Reset the UI state to show all conversations and clear outputs."""
+        return "", "", "", "", self.load_conversations()
     def launch(self, **kwargs):
         """Start the Gradio interface, forwarding all keyword arguments to its ``launch``."""
         app = self.interface
         return app.launch(**kwargs)

src/second_brain_online/config.py CHANGED Viewed

@@ -44,11 +44,12 @@ class Settings(BaseSettings):
         description="Name of the MongoDB database.",
     )
     MONGODB_COLLECTION_NAME: str = Field(
-        default="rag_intercom",
         description="Name of the MongoDB collection for RAG documents.",
     )
     MONGODB_URI: str = Field(
-        default="mongodb+srv://keshavchhaparia:bUSBXeVCGWDyQhDG@saaslabs.awtivxf.mongodb.net/?retryWrites=true&w=majority&appName=saaslabs",
         description="Connection URI for the MongoDB Atlas instance.",
     )

         description="Name of the MongoDB database.",
     )
     MONGODB_COLLECTION_NAME: str = Field(
+        default="rag_conversations",
         description="Name of the MongoDB collection for RAG documents.",
     )
     MONGODB_URI: str = Field(
+        default="mongodb+srv://keshavchhaparia:bUSBXeVCGWDyQhDG@saaslabs.awtivxf.mongodb.net/",
+        # default="mongodb+srv://contextdb:HOqIgSH01CoEiMb1@cluster0.d9cmff.mongodb.net/",
         description="Connection URI for the MongoDB Atlas instance.",
     )

temp_init.py DELETED Viewed

@@ -1,9 +0,0 @@
-from . import agents, rag
-# Optional import for evaluation - may cause issues in some environments
-try:
-    from .evaluation import evaluate
-    __all__ = ["rag", "agents", "evaluate"]
-except ImportError as e:
-    print(f"Warning: Could not import evaluation module: {e}")
-    __all__ = ["rag", "agents"]

uv.lock CHANGED Viewed

@@ -2344,6 +2344,7 @@ dependencies = [
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pymongo" },
     { name = "smolagents" },
 ]
@@ -2369,6 +2370,7 @@ requires-dist = [
     { name = "pydantic", specifier = ">=2.8.2" },
     { name = "pydantic-settings", specifier = ">=2.7.0" },
     { name = "pymongo", specifier = ">=4.10.1" },
     { name = "smolagents", specifier = ">=1.4.1" },
 ]

     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pymongo" },
+    { name = "sentence-transformers" },
     { name = "smolagents" },
 ]
     { name = "pydantic", specifier = ">=2.8.2" },
     { name = "pydantic-settings", specifier = ">=2.7.0" },
     { name = "pymongo", specifier = ">=4.10.1" },
+    { name = "sentence-transformers", specifier = ">=3.0.0" },
     { name = "smolagents", specifier = ">=1.4.1" },
 ]

what_can_i_do.py DELETED Viewed

@@ -1,60 +0,0 @@
-import opik
-from smolagents import Tool
-class WhatCanIDoTool(Tool):
-    name = "what_can_i_do"
-    description = """Returns a comprehensive list of available capabilities and topics in the Second Brain system.
-    This tool should be used when:
-    - The user explicitly asks what the system can do
-    - The user asks about available features or capabilities
-    - The user seems unsure about what questions they can ask
-    - The user wants to explore the system's knowledge areas
-    This tool should NOT be used when:
-    - The user asks a specific technical question
-    - The user already knows what they want to learn about
-    - The question is about a specific topic covered in the knowledge base"""
-    inputs = {
-        "question": {
-            "type": "string",
-            "description": "The user's query about system capabilities. While this parameter is required, the function returns a standard capability list regardless of the specific question."
-        }
-    }
-    output_type = "string"
-    @opik.track(name="what_can_i_do")
-    def forward(self, question: str) -> str:
-        """Returns a comprehensive list of available capabilities and topics in the Second Brain system."""
-        return """
-You can ask questions about the content in your Second Brain, such as:
-Architecture and Systems:
-- What is the feature/training/inference (FTI) architecture?
-- How do agentic systems work?
-- Detail how does agent memory work in agentic applications?
-LLM Technology:
-- What are LLMs?
-- What is BERT (Bidirectional Encoder Representations from Transformers)?
-- Detail how does RLHF (Reinforcement Learning from Human Feedback) work?
-- What are the top LLM frameworks for building applications?
-- Write me a paragraph on how can I optimize LLMs during inference?
-RAG and Document Processing:
-- What tools are available for processing PDFs for LLMs and RAG?
-- What's the difference between vector databases and vector indices?
-- How does document chunk overlap affect RAG performance?
-- What is chunk reranking and why is it important?
-- What are advanced RAG techniques for optimization?
-- How can RAG pipelines be evaluated?
-Learning Resources:
-- Can you recommend courses on LLMs and RAG?
-"""
-# Create an instance for backward compatibility
-what_can_i_do = WhatCanIDoTool()