akryldigital committed on
Commit 92633a7 · verified · 1 Parent(s): 26449fc
Dockerfile CHANGED
@@ -1,20 +1,29 @@
-FROM python:3.13.5-slim
+FROM python:3.11-slim
 
 WORKDIR /app
 
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
     && rm -rf /var/lib/apt/lists/*
 
+# Copy requirements first (for better Docker layer caching)
 COPY requirements.txt ./
-COPY src/ ./src/
 
-RUN pip3 install -r requirements.txt
+# Install Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
 
+# Copy all application files (excluding .dockerignore patterns)
+COPY . .
+
+# Expose Streamlit port (HF Spaces maps to 7860 automatically)
 EXPOSE 8501
 
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+# Health check for Streamlit
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+    CMD curl --fail http://localhost:8501/_stcore/health || exit 1
 
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+# Run Streamlit app
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless", "true"]
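The "COPY . ." comment assumes a .dockerignore in the build context; its contents are not shown in this commit, but a minimal sketch that keeps runtime artifacts (the conversations/ and feedback/ directories created by app.py) out of the image could look like:

    .git
    __pycache__/
    conversations/
    feedback/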
app.py ADDED
@@ -0,0 +1,694 @@
1
+ """
2
+ Intelligent Audit Report Chatbot UI
3
+ """
4
+
5
+
6
+ import os
7
+ import sys
8
+ import time
9
+ import json
10
+ import uuid
11
+ import logging
12
+ from pathlib import Path
13
+
14
+ import argparse
15
+ import streamlit as st
16
+ from langchain_core.messages import HumanMessage, AIMessage
17
+
18
+ from multi_agent_chatbot import get_multi_agent_chatbot
19
+ from smart_chatbot import get_chatbot as get_smart_chatbot
20
+ from src.reporting.feedback_schema import create_feedback_from_dict
21
+
22
+ # Configure logging
23
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Page config
27
+ st.set_page_config(
28
+ layout="wide",
29
+ page_icon="🤖",
30
+ initial_sidebar_state="expanded",
31
+ page_title="Intelligent Audit Report Chatbot"
32
+ )
33
+
34
+ # Custom CSS
35
+ st.markdown("""
36
+ <style>
37
+ .main-header {
38
+ font-size: 2.5rem;
39
+ font-weight: bold;
40
+ color: #1f77b4;
41
+ text-align: center;
42
+ margin-bottom: 1rem;
43
+ }
44
+
45
+ .subtitle {
46
+ font-size: 1.2rem;
47
+ color: #666;
48
+ text-align: center;
49
+ margin-bottom: 2rem;
50
+ }
51
+
52
+ .session-info {
53
+ background-color: #f0f2f6;
54
+ padding: 10px;
55
+ border-radius: 5px;
56
+ margin-bottom: 20px;
57
+ font-size: 0.9rem;
58
+ }
59
+
60
+ .user-message {
61
+ background-color: #007bff;
62
+ color: white;
63
+ padding: 12px 16px;
64
+ border-radius: 18px 18px 4px 18px;
65
+ margin: 8px 0;
66
+ margin-left: 20%;
67
+ word-wrap: break-word;
68
+ }
69
+
70
+ .bot-message {
71
+ background-color: #f1f3f4;
72
+ color: #333;
73
+ padding: 12px 16px;
74
+ border-radius: 18px 18px 18px 4px;
75
+ margin: 8px 0;
76
+ margin-right: 20%;
77
+ word-wrap: break-word;
78
+ border: 1px solid #e0e0e0;
79
+ }
80
+
81
+ .filter-section {
82
+ margin-bottom: 20px;
83
+ padding: 15px;
84
+ background-color: #f8f9fa;
85
+ border-radius: 8px;
86
+ border: 1px solid #e9ecef;
87
+ }
88
+
89
+ .filter-title {
90
+ font-weight: bold;
91
+ margin-bottom: 10px;
92
+ color: #495057;
93
+ }
94
+
95
+ .feedback-section {
96
+ background-color: #f8f9fa;
97
+ padding: 20px;
98
+ border-radius: 10px;
99
+ margin-top: 30px;
100
+ border: 2px solid #dee2e6;
101
+ }
102
+
103
+ .retrieval-history {
104
+ background-color: #ffffff;
105
+ padding: 15px;
106
+ border-radius: 5px;
107
+ margin: 10px 0;
108
+ border-left: 4px solid #007bff;
109
+ }
110
+ </style>
111
+ """, unsafe_allow_html=True)
112
+
113
+ def get_system_type():
114
+ """Get the current system type"""
115
+ system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
116
+ if system == 'smart':
117
+ return "Smart Chatbot System"
118
+ else:
119
+ return "Multi-Agent System"
120
+
121
+ def get_chatbot():
122
+ """Initialize and return the chatbot based on system type"""
123
+ # Check environment variable for system type
124
+ system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
125
+ if system == 'smart':
126
+ return get_smart_chatbot()
127
+ else:
128
+ return get_multi_agent_chatbot()
129
+
130
+ def serialize_messages(messages):
131
+ """Serialize LangChain messages to dictionaries"""
132
+ serialized = []
133
+ for msg in messages:
134
+ if hasattr(msg, 'content'):
135
+ serialized.append({
136
+ "type": type(msg).__name__,
137
+ "content": str(msg.content)
138
+ })
139
+ return serialized
140
+
141
+ def serialize_documents(sources):
142
+ """Serialize document objects to dictionaries with deduplication"""
143
+ serialized = []
144
+ seen_content = set()
145
+
146
+ for doc in sources:
147
+ content = getattr(doc, 'page_content', getattr(doc, 'content', ''))
148
+
149
+ # Skip if we've seen this exact content before
150
+ if content in seen_content:
151
+ continue
152
+
153
+ seen_content.add(content)
154
+
155
+ doc_dict = {
156
+ "content": content,
157
+ "metadata": getattr(doc, 'metadata', {}),
158
+ "score": getattr(doc, 'metadata', {}).get('reranked_score', getattr(doc, 'metadata', {}).get('original_score', 0.0)),
159
+ "id": getattr(doc, 'metadata', {}).get('_id', 'unknown'),
160
+ "source": getattr(doc, 'metadata', {}).get('source', 'unknown'),
161
+ "year": getattr(doc, 'metadata', {}).get('year', 'unknown'),
162
+ "district": getattr(doc, 'metadata', {}).get('district', 'unknown'),
163
+ "page": getattr(doc, 'metadata', {}).get('page', 'unknown'),
164
+ "chunk_id": getattr(doc, 'metadata', {}).get('chunk_id', 'unknown'),
165
+ "page_label": getattr(doc, 'metadata', {}).get('page_label', 'unknown'),
166
+ "original_score": getattr(doc, 'metadata', {}).get('original_score', 0.0),
167
+ "reranked_score": getattr(doc, 'metadata', {}).get('reranked_score', None)
168
+ }
169
+ serialized.append(doc_dict)
170
+
171
+ return serialized
172
+
173
+ @st.cache_data
174
+ def load_filter_options():
175
+ try:
176
+ with open("filter_options.json", "r") as f:
177
+ return json.load(f)
178
+ except FileNotFoundError:
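+ # Debug aid: show which JSON files are actually present in the working directory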
179
+ st.info([x for x in os.listdir() if x.endswith('.json')])
180
+ st.error("filter_options.json not found. Please run the metadata analysis script.")
181
+ return {"sources": [], "years": [], "districts": [], 'filenames': []}
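For reference, a minimal filter_options.json that this loader (and the whitelist loading in multi_agent_chatbot.py) expects might look like the following; the values are illustrative and mirror the fallback defaults used elsewhere in this commit:

    {
      "sources": ["Consolidated", "Local Government", "Ministry, Department and Agency"],
      "years": ["2018", "2019", "2020", "2021", "2022", "2023", "2024"],
      "districts": ["Kampala", "Gulu", "Kalangala"],
      "filenames": []
    }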
182
+
183
+ def main():
184
+ # Initialize session state
185
+ if 'messages' not in st.session_state:
186
+ st.session_state.messages = []
187
+ if 'conversation_id' not in st.session_state:
188
+ st.session_state.conversation_id = f"session_{uuid.uuid4().hex[:8]}"
189
+ if 'session_start_time' not in st.session_state:
190
+ st.session_state.session_start_time = time.time()
191
+ if 'active_filters' not in st.session_state:
192
+ st.session_state.active_filters = {'sources': [], 'years': [], 'districts': [], 'filenames': []}
193
+ # Track RAG retrieval history for feedback
194
+ if 'rag_retrieval_history' not in st.session_state:
195
+ st.session_state.rag_retrieval_history = []
196
+ # Initialize chatbot only once per app session (cached)
197
+ if 'chatbot' not in st.session_state:
198
+ with st.spinner("🔄 Loading AI models and connecting to database..."):
199
+ st.session_state.chatbot = get_chatbot()
200
+ st.success("✅ AI system ready!")
201
+
202
+ # Reset conversation history if needed (but keep chatbot cached)
203
+ if 'reset_conversation' in st.session_state and st.session_state.reset_conversation:
204
+ st.session_state.messages = []
205
+ st.session_state.conversation_id = f"session_{uuid.uuid4().hex[:8]}"
206
+ st.session_state.session_start_time = time.time()
207
+ st.session_state.rag_retrieval_history = []
208
+ st.session_state.feedback_submitted = False
209
+ st.session_state.reset_conversation = False
210
+ st.rerun()
211
+
212
+ # Header with system indicator
213
+ col1, col2 = st.columns([3, 1])
214
+ with col1:
215
+ st.markdown('<h1 class="main-header">🤖 Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
216
+ with col2:
217
+ system_type = get_system_type()
218
+ if "Multi-Agent" in system_type:
219
+ st.success(f"🔧 {system_type}")
220
+ else:
221
+ st.info(f"🔧 {system_type}")
222
+ st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
223
+
224
+ # Session info
225
+ duration = int(time.time() - st.session_state.session_start_time)
226
+ duration_str = f"{duration // 60}m {duration % 60}s"
227
+ st.markdown(f'''
228
+ <div class="session-info">
229
+ <strong>Session Info:</strong> Messages: {len(st.session_state.messages)} | Duration: {duration_str} | Status: Active | ID: {st.session_state.conversation_id}
230
+ </div>
231
+ ''', unsafe_allow_html=True)
232
+
233
+ # Load filter options
234
+ filter_options = load_filter_options()
235
+
236
+ # Sidebar for filters
237
+ with st.sidebar:
238
+ st.markdown("### 🔍 Search Filters")
239
+ st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
240
+
241
+ st.markdown('<div class="filter-section">', unsafe_allow_html=True)
242
+ st.markdown('<div class="filter-title">📄 Specific Reports (Filename Filter)</div>', unsafe_allow_html=True)
243
+ st.markdown('<p style="font-size: 0.85em; color: #666;">⚠️ Selecting specific reports will ignore all other filters</p>', unsafe_allow_html=True)
244
+ selected_filenames = st.multiselect(
245
+ "Select specific reports:",
246
+ options=filter_options.get('filenames', []),
247
+ default=st.session_state.active_filters.get('filenames', []),
248
+ key="filenames_filter",
249
+ help="Choose specific reports to search. When enabled, all other filters are ignored."
250
+ )
251
+ st.markdown('</div>', unsafe_allow_html=True)
252
+
253
+ # Determine if filename filter is active
254
+ filename_mode = len(selected_filenames) > 0
255
+ # Sources filter
256
+ st.markdown('<div class="filter-section">', unsafe_allow_html=True)
257
+ st.markdown('<div class="filter-title">📊 Sources</div>', unsafe_allow_html=True)
258
+ selected_sources = st.multiselect(
259
+ "Select sources:",
260
+ options=filter_options['sources'],
261
+ default=st.session_state.active_filters['sources'],
262
+ disabled=filename_mode,
263
+ key="sources_filter",
264
+ help="Choose which types of reports to search"
265
+ )
266
+ st.markdown('</div>', unsafe_allow_html=True)
267
+
268
+ # Years filter
269
+ st.markdown('<div class="filter-section">', unsafe_allow_html=True)
270
+ st.markdown('<div class="filter-title">📅 Years</div>', unsafe_allow_html=True)
271
+ selected_years = st.multiselect(
272
+ "Select years:",
273
+ options=filter_options['years'],
274
+ default=st.session_state.active_filters['years'],
275
+ disabled=filename_mode,
276
+ key="years_filter",
277
+ help="Choose which years to search"
278
+ )
279
+ st.markdown('</div>', unsafe_allow_html=True)
280
+
281
+ # Districts filter
282
+ st.markdown('<div class="filter-section">', unsafe_allow_html=True)
283
+ st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
284
+ selected_districts = st.multiselect(
285
+ "Select districts:",
286
+ options=filter_options['districts'],
287
+ default=st.session_state.active_filters['districts'],
288
+ disabled=filename_mode,
289
+ key="districts_filter",
290
+ help="Choose which districts to search"
291
+ )
292
+ st.markdown('</div>', unsafe_allow_html=True)
293
+
294
+ # Update active filters
295
+ st.session_state.active_filters = {
296
+ 'sources': selected_sources if not filename_mode else [],
297
+ 'years': selected_years if not filename_mode else [],
298
+ 'districts': selected_districts if not filename_mode else [],
299
+ 'filenames': selected_filenames
300
+ }
301
+
302
+ # Clear filters button
303
+ if st.button("🗑️ Clear All Filters", key="clear_filters_button"):
304
+ st.session_state.active_filters = {'sources': [], 'years': [], 'districts': [], 'filenames': []}
305
+ st.rerun()
306
+
307
+ # Main content area with tabs
308
+ tab1, tab2 = st.tabs(["💬 Chat", "📄 Retrieved Documents"])
309
+
310
+ with tab1:
311
+ # Chat container
312
+ chat_container = st.container()
313
+
314
+ with chat_container:
315
+ # Display conversation history
316
+ for message in st.session_state.messages:
317
+ if isinstance(message, HumanMessage):
318
+ st.markdown(f'<div class="user-message">{message.content}</div>', unsafe_allow_html=True)
319
+ elif isinstance(message, AIMessage):
320
+ st.markdown(f'<div class="bot-message">{message.content}</div>', unsafe_allow_html=True)
321
+
322
+ # Input area
323
+ st.markdown("<br>", unsafe_allow_html=True)
324
+
325
+ # Create two columns for input and button
326
+ col1, col2 = st.columns([4, 1])
327
+
328
+ with col1:
329
+ # Use a counter to force input clearing
330
+ if 'input_counter' not in st.session_state:
331
+ st.session_state.input_counter = 0
332
+
333
+ user_input = st.text_input(
334
+ "Type your message here...",
335
+ placeholder="Ask about budget allocations, expenditures, or audit findings...",
336
+ key=f"user_input_{st.session_state.input_counter}",
337
+ label_visibility="collapsed"
338
+ )
339
+
340
+ with col2:
341
+ send_button = st.button("Send", key="send_button", use_container_width=True)
342
+
343
+ # Clear chat button
344
+ if st.button("🗑️ Clear Chat", key="clear_chat_button"):
345
+ st.session_state.reset_conversation = True
346
+ # Clear all conversation files
347
+ import os
348
+ conversations_dir = "conversations"
349
+ if os.path.exists(conversations_dir):
350
+ for file in os.listdir(conversations_dir):
351
+ if file.endswith('.json'):
352
+ os.remove(os.path.join(conversations_dir, file))
353
+ st.rerun()
354
+
355
+ # Handle user input
356
+ if send_button and user_input:
357
+ # Construct filter context string
358
+ filter_context_str = ""
359
+ if selected_filenames:
360
+ filter_context_str += "FILTER CONTEXT:\n"
361
+ filter_context_str += f"Filenames: {', '.join(selected_filenames)}\n"
362
+ filter_context_str += "USER QUERY:\n"
363
+ elif selected_sources or selected_years or selected_districts:
364
+ filter_context_str += "FILTER CONTEXT:\n"
365
+ if selected_sources:
366
+ filter_context_str += f"Sources: {', '.join(selected_sources)}\n"
367
+ if selected_years:
368
+ filter_context_str += f"Years: {', '.join(selected_years)}\n"
369
+ if selected_districts:
370
+ filter_context_str += f"Districts: {', '.join(selected_districts)}\n"
371
+ filter_context_str += "USER QUERY:\n"
372
+
373
+ full_query = filter_context_str + user_input
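As a sketch of what this prefix produces, a query sent with the Sources and Years filters set (values illustrative) is passed to the chatbot as:

    FILTER CONTEXT:
    Sources: Local Government
    Years: 2023
    USER QUERY:
    What were the main audit findings on salary payments?

The multi-agent system parses this prefix back out in _extract_ui_filters (multi_agent_chatbot.py).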
374
+
375
+ # Add user message to history
376
+ st.session_state.messages.append(HumanMessage(content=user_input))
377
+
378
+ # Get chatbot response
379
+ with st.spinner("🤔 Thinking..."):
380
+ try:
381
+ # Pass the full query with filter context
382
+ chat_result = st.session_state.chatbot.chat(full_query, st.session_state.conversation_id)
383
+
384
+ # Handle both old format (string) and new format (dict)
385
+ if isinstance(chat_result, dict):
386
+ response = chat_result['response']
387
+ rag_result = chat_result.get('rag_result')
388
+ st.session_state.last_rag_result = rag_result
389
+
390
+ # Track RAG retrieval for feedback
391
+ if rag_result:
392
+ sources = rag_result.get('sources', []) if isinstance(rag_result, dict) else (rag_result.sources if hasattr(rag_result, 'sources') else [])
393
+
394
+ # Get the actual RAG query
395
+ actual_rag_query = chat_result.get('actual_rag_query', '')
396
+ if actual_rag_query:
397
+ # Format it like the log message
398
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
399
+ formatted_query = f"{timestamp} - INFO - 🔍 ACTUAL RAG QUERY: '{actual_rag_query}'"
400
+ else:
401
+ formatted_query = "No RAG query available"
402
+
403
+ retrieval_entry = {
404
+ "conversation_up_to": serialize_messages(st.session_state.messages),
405
+ "rag_query_expansion": formatted_query,
406
+ "docs_retrieved": serialize_documents(sources)
407
+ }
408
+ st.session_state.rag_retrieval_history.append(retrieval_entry)
409
+ else:
410
+ response = chat_result
411
+ st.session_state.last_rag_result = None
412
+
413
+ # Add bot response to history
414
+ st.session_state.messages.append(AIMessage(content=response))
415
+
416
+ except Exception as e:
417
+ error_msg = f"Sorry, I encountered an error: {str(e)}"
418
+ st.session_state.messages.append(AIMessage(content=error_msg))
419
+
420
+ # Clear input and rerun
421
+ st.session_state.input_counter += 1 # This will clear the input
422
+ st.rerun()
423
+
424
+ with tab2:
425
+ # Document retrieval panel
426
+ if hasattr(st.session_state, 'last_rag_result') and st.session_state.last_rag_result:
427
+ rag_result = st.session_state.last_rag_result
428
+
429
+ # Handle both PipelineResult object and dictionary formats
430
+ sources = None
431
+ if hasattr(rag_result, 'sources'):
432
+ # PipelineResult object format
433
+ sources = rag_result.sources
434
+ elif isinstance(rag_result, dict) and 'sources' in rag_result:
435
+ # Dictionary format from multi-agent system
436
+ sources = rag_result['sources']
437
+
438
+ if sources and len(sources) > 0:
439
+ # Count unique filenames
440
+ unique_filenames = set()
441
+ for doc in sources:
442
+ filename = getattr(doc, 'metadata', {}).get('filename', 'Unknown')
443
+ unique_filenames.add(filename)
444
+
445
+ st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 10):**")
446
+ if len(unique_filenames) < len(sources):
447
+ st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
448
+
449
+ for i, doc in enumerate(sources[:10]): # Show top 10
450
+ # Get relevance score and ID if available
451
+ metadata = getattr(doc, 'metadata', {})
452
+ score = metadata.get('reranked_score', metadata.get('original_score', None))
453
+ chunk_id = metadata.get('_id', 'Unknown')
454
+ score_text = f" (Score: {score:.3f}, ID: {chunk_id[:8]}...)" if score is not None else f" (ID: {chunk_id[:8]}...)"
455
+
456
+ with st.expander(f"📄 Document {i+1}: {getattr(doc, 'metadata', {}).get('filename', 'Unknown')[:50]}...{score_text}"):
457
+ # Display document metadata with emojis
458
+ metadata = getattr(doc, 'metadata', {})
459
+ col1, col2, col3, col4 = st.columns([2, 1.5, 1, 1])
460
+
461
+ with col1:
462
+ st.write(f"📄 **File:** {metadata.get('filename', 'Unknown')}")
463
+ with col2:
464
+ st.write(f"🏛️ **Source:** {metadata.get('source', 'Unknown')}")
465
+ with col3:
466
+ st.write(f"📅 **Year:** {metadata.get('year', 'Unknown')}")
467
+ with col4:
468
+ # Display page number and chunk ID
469
+ page = metadata.get('page_label', metadata.get('page', 'Unknown'))
470
+ chunk_id = metadata.get('_id', 'Unknown')
471
+ st.write(f"📖 **Page:** {page}")
472
+ st.write(f"🆔 **ID:** {chunk_id}")
473
+
474
+ # Display full content (no truncation)
475
+ content = getattr(doc, 'page_content', 'No content available')
476
+ st.write(f"**Full Content:**")
477
+ st.text_area("Full Content", value=content, height=300, disabled=True, label_visibility="collapsed", key=f"preview_{i}")
478
+ else:
479
+ st.info("No documents were retrieved for the last query.")
480
+ else:
481
+ st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
482
+
483
+ # Feedback Dashboard Section
484
+ st.markdown("---")
485
+ st.markdown("### 💬 Feedback Dashboard")
486
+
487
+ # Check if there's any conversation to provide feedback on
488
+ has_conversation = len(st.session_state.messages) > 0
489
+ has_retrievals = len(st.session_state.rag_retrieval_history) > 0
490
+
491
+ if not has_conversation:
492
+ st.info("💡 Start a conversation to provide feedback!")
493
+ st.markdown("The feedback dashboard will be enabled once you begin chatting.")
494
+ else:
495
+ st.markdown("Help us improve by providing feedback on this conversation.")
496
+
497
+ # Initialize feedback state if not exists
498
+ if 'feedback_submitted' not in st.session_state:
499
+ st.session_state.feedback_submitted = False
500
+
501
+ # Feedback form
502
+ with st.form("feedback_form", clear_on_submit=False):
503
+ col1, col2 = st.columns([1, 1])
504
+
505
+ with col1:
506
+ feedback_score = st.slider(
507
+ "Rate this conversation (1-5)",
508
+ min_value=1,
509
+ max_value=5,
510
+ help="How satisfied are you with the conversation?"
511
+ )
512
+
513
+ with col2:
514
+ is_feedback_about_last_retrieval = st.checkbox(
515
+ "Feedback about last retrieval only",
516
+ value=True,
517
+ help="If checked, feedback applies to the most recent document retrieval"
518
+ )
519
+
520
+ open_ended_feedback = st.text_area(
521
+ "Your feedback (optional)",
522
+ placeholder="Tell us what went well or what could be improved...",
523
+ height=100
524
+ )
525
+
526
+ # Guard against a missing score (st.slider always returns a value, so in practice the button stays enabled)
527
+ submit_disabled = feedback_score is None
528
+
529
+ submitted = st.form_submit_button(
530
+ "📤 Submit Feedback",
531
+ use_container_width=True,
532
+ disabled=submit_disabled
533
+ )
534
+
535
+ if submitted and not st.session_state.feedback_submitted:
536
+ # Log the feedback data being submitted
537
+ print("=" * 80)
538
+ print("🔄 FEEDBACK SUBMISSION: Starting...")
539
+ print("=" * 80)
540
+ st.write("🔍 **Debug: Feedback Data Being Submitted:**")
541
+
542
+ # Create feedback data dictionary
543
+ feedback_dict = {
544
+ "open_ended_feedback": open_ended_feedback,
545
+ "score": feedback_score,
546
+ "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
547
+ "retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
548
+ "conversation_id": st.session_state.conversation_id,
549
+ "timestamp": time.time(),
550
+ "message_count": len(st.session_state.messages),
551
+ "has_retrievals": has_retrievals,
552
+ "retrieval_count": len(st.session_state.rag_retrieval_history)
553
+ }
554
+
555
+ print(f"📝 FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
556
+
557
+ # Create UserFeedback dataclass instance
558
+ feedback_obj = None # Initialize outside try block
559
+ try:
560
+ feedback_obj = create_feedback_from_dict(feedback_dict)
561
+ print(f"✅ FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
562
+ st.write(f"✅ **Feedback Object Created**")
563
+ st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
564
+ st.write(f"- Score: {feedback_obj.score}/5")
565
+ st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
566
+
567
+ # Convert back to dict for JSON serialization
568
+ feedback_data = feedback_obj.to_dict()
569
+ except Exception as e:
570
+ print(f"❌ FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
571
+ st.error(f"Failed to create feedback object: {e}")
572
+ feedback_data = feedback_dict
573
+
574
+ # Display the data being submitted
575
+ st.json(feedback_data)
576
+
577
+ # Save feedback to file
578
+ feedback_dir = Path("feedback")
579
+ feedback_dir.mkdir(exist_ok=True)
580
+
581
+ feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
582
+
583
+ try:
584
+ # Save to local file
585
+ print(f"💾 FEEDBACK SAVE: Saving to local file: {feedback_file}")
586
+ with open(feedback_file, 'w') as f:
587
+ json.dump(feedback_data, f, indent=2, default=str)
588
+
589
+ print(f"✅ FEEDBACK SAVE: Local file saved successfully")
590
+ st.success("✅ Thank you for your feedback! It has been saved locally.")
591
+ st.balloons()
592
+
593
+ # Save to Snowflake if enabled and credentials available
594
+ logger.info("🔄 FEEDBACK SAVE: Starting Snowflake save process...")
595
+ logger.info(f"📊 FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
596
+
597
+ try:
598
+ import os
599
+ snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
600
+ logger.info(f"🔍 SNOWFLAKE CHECK: enabled={snowflake_enabled}")
601
+
602
+ if snowflake_enabled:
603
+ if feedback_obj:
604
+ try:
605
+ from auditqa.reporting.snowflake_connector import save_to_snowflake
606
+ logger.info("📤 SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
607
+ print("📤 SNOWFLAKE UI: Attempting to save feedback to Snowflake...") # Also print to terminal
608
+
609
+ if save_to_snowflake(feedback_obj):
610
+ logger.info("✅ SNOWFLAKE UI: Successfully saved to Snowflake")
611
+ print("✅ SNOWFLAKE UI: Successfully saved to Snowflake") # Also print to terminal
612
+ st.success("✅ Feedback also saved to Snowflake!")
613
+ else:
614
+ logger.warning("⚠️ SNOWFLAKE UI: Save failed")
615
+ print("⚠️ SNOWFLAKE UI: Save failed") # Also print to terminal
616
+ st.warning("⚠️ Snowflake save failed, but local save succeeded")
617
+ except Exception as e:
618
+ logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
619
+ print(f"❌ SNOWFLAKE UI ERROR: {e}") # Also print to terminal
620
+ import traceback
621
+ traceback.print_exc() # Print full traceback to terminal
622
+ st.warning(f"⚠️ Could not save to Snowflake: {e}")
623
+ else:
624
+ logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
625
+ print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)") # Also print to terminal
626
+ st.warning("⚠️ Skipping Snowflake save (feedback object not created)")
627
+ else:
628
+ logger.info("💡 SNOWFLAKE UI: Integration disabled")
629
+ print("💡 SNOWFLAKE UI: Integration disabled") # Also print to terminal
630
+ st.info("💡 Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
631
+ except NameError as e:
632
+ import traceback
633
+ traceback.print_exc()
634
+ logger.error(f"❌ NameError in Snowflake save: {e}")
635
+ print(f"❌ NameError in Snowflake save: {e}") # Also print to terminal
636
+ st.warning(f"⚠️ Snowflake save error: {e}")
637
+ except Exception as e:
638
+ logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
639
+ print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}") # Also print to terminal
640
+ st.warning(f"⚠️ Snowflake save error: {e}")
641
+
642
+ # Mark feedback as submitted to prevent resubmission
643
+ st.session_state.feedback_submitted = True
644
+
645
+ print("=" * 80)
646
+ print(f"✅ FEEDBACK SUBMISSION: Completed successfully")
647
+ print("=" * 80)
648
+
649
+ # Log file location
650
+ st.info(f"📁 Feedback saved to: {feedback_file}")
651
+
652
+ except Exception as e:
653
+ print(f"❌ FEEDBACK SUBMISSION: Error saving feedback: {e}")
654
+ print(f"❌ FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
655
+ import traceback
656
+ traceback.print_exc()
657
+ st.error(f"❌ Error saving feedback: {e}")
658
+ st.write(f"Debug error: {str(e)}")
659
+
660
+ elif st.session_state.feedback_submitted:
661
+ st.success("✅ Feedback already submitted for this conversation!")
662
+ if st.button("🔄 Submit New Feedback", key="new_feedback_button"):
663
+ st.session_state.feedback_submitted = False
664
+ st.rerun()
665
+
666
+ # Display retrieval history stats
667
+ if st.session_state.rag_retrieval_history:
668
+ st.markdown("---")
669
+ st.markdown("#### 📊 Retrieval History")
670
+
671
+ with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
672
+ for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
673
+ st.markdown(f"**Retrieval #{idx}**")
674
+
675
+ # Display the actual RAG query
676
+ rag_query_expansion = entry.get("rag_query_expansion", "No query available")
677
+ st.code(rag_query_expansion, language="text")
678
+
679
+ # Display summary stats
680
+ st.json({
681
+ "conversation_length": len(entry.get("conversation_up_to", [])),
682
+ "documents_retrieved": len(entry.get("docs_retrieved", []))
683
+ })
684
+ st.markdown("---")
685
+
686
+ # Auto-scroll to bottom
687
+ st.markdown("""
688
+ <script>
689
+ window.scrollTo(0, document.body.scrollHeight);
690
+ </script>
691
+ """, unsafe_allow_html=True)
692
+
693
+ if __name__ == "__main__":
694
+ main()
multi_agent_chatbot.py ADDED
@@ -0,0 +1,1167 @@
1
+ """
2
+ Multi-Agent RAG Chatbot using LangGraph
3
+
4
+ This system implements a 3-agent architecture:
5
+ 1. Main Agent: Handles conversation flow, follow-ups, and determines when to call RAG
6
+ 2. RAG Agent: Rewrites queries and applies filters for document retrieval
7
+ 3. Response Agent: Generates final answers from retrieved documents
8
+
9
+ Each agent has specialized prompts and responsibilities.
10
+ """
11
+
12
+ import os
13
+ import json
14
+ import time
15
+ import logging
16
+ from pathlib import Path
17
+ from datetime import datetime
18
+ from dataclasses import dataclass
19
+ from typing import Dict, List, Any, Optional, TypedDict
20
+
21
+
22
+ import re
23
+ from langchain_core.tools import tool
24
+ from langgraph.graph import StateGraph, END
25
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
26
+ from langchain_core.prompts import ChatPromptTemplate
27
+
28
+
29
+ from src.pipeline import PipelineManager
30
+ from src.config.loader import load_config
31
+ from src.llm.adapters import get_llm_client
32
+
33
+
34
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ @dataclass
39
+ class QueryContext:
40
+ """Context extracted from conversation"""
41
+ has_district: bool = False
42
+ has_source: bool = False
43
+ has_year: bool = False
44
+ extracted_district: Optional[str] = None
45
+ extracted_source: Optional[str] = None
46
+ extracted_year: Optional[str] = None
47
+ ui_filters: Optional[Dict[str, List[str]]] = None
48
+ confidence_score: float = 0.0
49
+ needs_follow_up: bool = False
50
+ follow_up_question: Optional[str] = None
51
+
52
+ class MultiAgentState(TypedDict):
53
+ """State for the multi-agent conversation flow"""
54
+ conversation_id: str
55
+ messages: List[Any]
56
+ current_query: str
57
+ query_context: Optional[QueryContext]
58
+ rag_query: Optional[str]
59
+ rag_filters: Optional[Dict[str, Any]]
60
+ retrieved_documents: Optional[List[Any]]
61
+ final_response: Optional[str]
62
+ agent_logs: List[str]
63
+ conversation_context: Dict[str, Any]
64
+ session_start_time: float
65
+ last_ai_message_time: float
66
+
67
+ class MultiAgentRAGChatbot:
68
+ """Multi-agent RAG chatbot with specialized agents"""
69
+
70
+ def __init__(self, config_path: str = "auditqa/config/settings.yaml"):
71
+ """Initialize the multi-agent chatbot"""
72
+ self.config = load_config(config_path)
73
+
74
+ # Get LLM provider from config
75
+ reader_config = self.config.get("reader", {})
76
+ default_type = reader_config.get("default_type", "INF_PROVIDERS")
77
+ provider_name = default_type.lower()
78
+
79
+ self.llm_adapter = get_llm_client(provider_name, self.config)
80
+
81
+ # Create a simple wrapper for LangChain compatibility
82
+ class LLMWrapper:
83
+ def __init__(self, adapter):
84
+ self.adapter = adapter
85
+
86
+ def invoke(self, messages):
87
+ # Convert LangChain messages to the format expected by the adapter
88
+ if isinstance(messages, list):
89
+ formatted_messages = []
90
+ for msg in messages:
91
+ if hasattr(msg, 'content'):
92
+ # Map each message class to a chat role so system prompts are not sent as assistant turns
+ if msg.__class__.__name__ == "HumanMessage":
+     role = "user"
+ elif msg.__class__.__name__ == "SystemMessage":
+     role = "system"
+ else:
+     role = "assistant"
93
+ formatted_messages.append({"role": role, "content": msg.content})
94
+ else:
95
+ formatted_messages.append({"role": "user", "content": str(msg)})
96
+ else:
97
+ formatted_messages = [{"role": "user", "content": str(messages)}]
98
+
99
+ # Use the adapter to get response
100
+ response = self.adapter.generate(formatted_messages)
101
+
102
+ # Return a mock response object
103
+ class MockResponse:
104
+ def __init__(self, content):
105
+ self.content = content
106
+
107
+ return MockResponse(response.content)
108
+
109
+ self.llm = LLMWrapper(self.llm_adapter)
110
+
111
+ # Initialize pipeline manager early to load models
112
+ logger.info("🔄 Initializing pipeline manager and loading models...")
113
+ self.pipeline_manager = PipelineManager(self.config)
114
+ logger.info("✅ Pipeline manager initialized and models loaded")
115
+
116
+ # Connect to vector store
117
+ logger.info("🔄 Connecting to vector store...")
118
+ if not self.pipeline_manager.connect_vectorstore():
119
+ logger.error("❌ Failed to connect to vector store")
120
+ raise RuntimeError("Vector store connection failed")
121
+ logger.info("✅ Vector store connected successfully")
122
+
123
+ # Load dynamic data
124
+ self._load_dynamic_data()
125
+
126
+ # Build the multi-agent graph
127
+ self.graph = self._build_graph()
128
+
129
+ # Conversations directory
130
+ self.conversations_dir = Path("conversations")
131
+ self.conversations_dir.mkdir(exist_ok=True)
132
+
133
+ logger.info("🤖 Multi-Agent RAG Chatbot initialized")
134
+
135
+ def _load_dynamic_data(self):
136
+ """Load dynamic data from filter_options.json and add_district_metadata.py"""
137
+ # Load filter options
138
+ try:
139
+ fo = Path("filter_options.json")
140
+ if fo.exists():
141
+ with open(fo) as f:
142
+ data = json.load(f)
143
+ self.year_whitelist = [str(y).strip() for y in data.get("years", [])]
144
+ self.source_whitelist = [str(s).strip() for s in data.get("sources", [])]
145
+ self.district_whitelist = [str(d).strip() for d in data.get("districts", [])]
146
+ else:
147
+ # Fallback to default values
148
+ self.year_whitelist = ['2018', '2019', '2020', '2021', '2022', '2023', '2024']
149
+ self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
150
+ self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
151
+ except Exception as e:
152
+ logger.warning(f"Could not load filter options: {e}")
153
+ self.year_whitelist = ['2018', '2019', '2020', '2021', '2022', '2023', '2024']
154
+ self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
155
+ self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
156
+
157
+ # Enrich district list from add_district_metadata.py
158
+ try:
159
+ from add_district_metadata import DistrictMetadataProcessor
160
+ proc = DistrictMetadataProcessor()
161
+ names = set()
162
+ for key, mapping in proc.district_mappings.items():
163
+ if getattr(mapping, 'is_district', True):
164
+ names.add(mapping.name)
165
+ if names:
166
+ merged = list(self.district_whitelist)
167
+ for n in sorted(names):
168
+ if n not in merged:
169
+ merged.append(n)
170
+ self.district_whitelist = merged
171
+ logger.info(f"🧭 District whitelist enriched: {len(self.district_whitelist)} entries")
172
+ except Exception as e:
173
+ logger.info(f"ℹ️ Could not enrich districts: {e}")
174
+
175
+ # Calculate current year dynamically
176
+ self.current_year = str(datetime.now().year)
177
+ self.previous_year = str(datetime.now().year - 1)
178
+
179
+ # Log the actual filter values for debugging
180
+ logger.info(f"📊 ACTUAL FILTER VALUES:")
181
+ logger.info(f" Years: {self.year_whitelist}")
182
+ logger.info(f" Sources: {self.source_whitelist}")
183
+ logger.info(f" Districts: {len(self.district_whitelist)} districts (first 10: {self.district_whitelist[:10]})")
184
+
185
+ def _build_graph(self) -> StateGraph:
186
+ """Build the multi-agent LangGraph"""
187
+ graph = StateGraph(MultiAgentState)
188
+
189
+ # Add nodes for each agent
190
+ graph.add_node("main_agent", self._main_agent)
191
+ graph.add_node("rag_agent", self._rag_agent)
192
+ graph.add_node("response_agent", self._response_agent)
193
+
194
+ # Define the flow
195
+ graph.set_entry_point("main_agent")
196
+
197
+ # Main agent decides next step
198
+ graph.add_conditional_edges(
199
+ "main_agent",
200
+ self._should_call_rag,
201
+ {
202
+ "follow_up": END,
203
+ "call_rag": "rag_agent"
204
+ }
205
+ )
206
+
207
+ # RAG agent calls response agent
208
+ graph.add_edge("rag_agent", "response_agent")
209
+
210
+ # Response agent returns to main agent for potential follow-ups
211
+ graph.add_edge("response_agent", "main_agent")
212
+
213
+ return graph.compile()
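+ # Compiled flow: main_agent either ends the turn (follow-up needed or response already set)
+ # or hands off to rag_agent -> response_agent, which loops back to main_agent to finish.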
214
+
215
+ def _should_call_rag(self, state: MultiAgentState) -> str:
216
+ """Determine if we should call RAG or ask follow-up"""
217
+ # If we already have a final response (from response agent), end
218
+ if state.get("final_response"):
219
+ return "follow_up"
220
+
221
+ context = state["query_context"]
222
+ if context and context.needs_follow_up:
223
+ return "follow_up"
224
+ return "call_rag"
225
+
226
+ def _main_agent(self, state: MultiAgentState) -> MultiAgentState:
227
+ """Main Agent: Handles conversation flow and follow-ups"""
228
+ logger.info("🎯 MAIN AGENT: Starting analysis")
229
+
230
+ # If we already have a final response from response agent, end gracefully
231
+ if state.get("final_response"):
232
+ logger.info("🎯 MAIN AGENT: Final response already exists, ending conversation flow")
233
+ return state
234
+
235
+ query = state["current_query"]
236
+ messages = state["messages"]
237
+
238
+ logger.info(f"🎯 MAIN AGENT: Extracting UI filters from query")
239
+ ui_filters = self._extract_ui_filters(query)
240
+ logger.info(f"🎯 MAIN AGENT: UI filters extracted: {ui_filters}")
241
+
242
+ # Analyze query context
243
+ logger.info(f"🎯 MAIN AGENT: Analyzing query context")
244
+ context = self._analyze_query_context(query, messages, ui_filters)
245
+
246
+ # Log agent decision
247
+ state["agent_logs"].append(f"MAIN AGENT: Context analyzed - district={context.has_district}, source={context.has_source}, year={context.has_year}")
248
+ logger.info(f"🎯 MAIN AGENT: Context analysis complete - district={context.has_district}, source={context.has_source}, year={context.has_year}")
249
+
250
+ # Store context
251
+ state["query_context"] = context
252
+
253
+ # If follow-up needed, generate response
254
+ if context.needs_follow_up:
255
+ logger.info(f"🎯 MAIN AGENT: Follow-up needed, generating question")
256
+ response = context.follow_up_question
257
+ state["final_response"] = response
258
+ state["last_ai_message_time"] = time.time()
259
+ logger.info(f"🎯 MAIN AGENT: Follow-up question generated: {response[:100]}...")
260
+ else:
261
+ logger.info("🎯 MAIN AGENT: No follow-up needed, proceeding to RAG")
262
+
263
+ return state
264
+
265
+ def _rag_agent(self, state: MultiAgentState) -> MultiAgentState:
266
+ """RAG Agent: Rewrites queries and applies filters"""
267
+ logger.info("🔍 RAG AGENT: Starting query rewriting and filter preparation")
268
+
269
+ context = state["query_context"]
270
+ messages = state["messages"]
271
+
272
+ logger.info(f"🔍 RAG AGENT: Context received - district={context.has_district}, source={context.has_source}, year={context.has_year}")
273
+
274
+ # Rewrite query for RAG
275
+ logger.info(f"🔍 RAG AGENT: Rewriting query for optimal retrieval")
276
+ rag_query = self._rewrite_query_for_rag(messages, context)
277
+ logger.info(f"🔍 RAG AGENT: Query rewritten: '{rag_query}'")
278
+
279
+ # Build filters
280
+ logger.info(f"🔍 RAG AGENT: Building filters from context")
281
+ filters = self._build_filters(context)
282
+ logger.info(f"🔍 RAG AGENT: Filters built: {filters}")
283
+
284
+ # Log RAG preparation
285
+ state["agent_logs"].append(f"RAG AGENT: Query='{rag_query}', Filters={filters}")
286
+
287
+ # Store for response agent
288
+ state["rag_query"] = rag_query
289
+ state["rag_filters"] = filters
290
+
291
+ logger.info(f"🔍 RAG AGENT: Preparation complete, ready for retrieval")
292
+
293
+ return state
294
+
295
+ def _response_agent(self, state: MultiAgentState) -> MultiAgentState:
296
+ """Response Agent: Generates final answer from retrieved documents"""
297
+ logger.info("📝 RESPONSE AGENT: Starting document retrieval and answer generation")
298
+
299
+ rag_query = state["rag_query"]
300
+ filters = state["rag_filters"]
301
+
302
+ logger.info(f"📝 RESPONSE AGENT: Starting RAG retrieval with query: '{rag_query}'")
303
+ logger.info(f"📝 RESPONSE AGENT: Using filters: {filters}")
304
+
305
+ # Perform RAG retrieval
306
+ logger.info(f"📝 RESPONSE AGENT: Calling pipeline manager for retrieval")
307
+ logger.info(f"🔍 ACTUAL RAG QUERY: '{rag_query}'")
308
+ logger.info(f"🔍 ACTUAL FILTERS: {filters}")
309
+ try:
310
+ # Extract filenames from filters if present
311
+ filenames = filters.get("filenames") if filters else None
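+ # Filename filters are carried inside the `filters` dict passed below; this variable is not passed separately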
312
+
313
+ result = self.pipeline_manager.run(
314
+ query=rag_query,
315
+ sources=filters.get("sources") if filters else None,
316
+ auto_infer_filters=False,
317
+ filters=filters if filters else None
318
+ )
319
+
320
+ logger.info(f"📝 RESPONSE AGENT: RAG retrieval completed - {len(result.sources)} documents retrieved")
321
+ logger.info(f"🔍 RETRIEVAL DEBUG: Result type: {type(result)}")
322
+ logger.info(f"🔍 RETRIEVAL DEBUG: Result sources type: {type(result.sources)}")
323
+ # logger.info(f"🔍 RETRIEVAL DEBUG: Result metadata: {getattr(result, 'metadata', 'No metadata')}")
324
+
325
+ if len(result.sources) == 0:
326
+ logger.warning(f"⚠️ NO DOCUMENTS RETRIEVED: Query='{rag_query}', Filters={filters}")
327
+ logger.warning(f"⚠️ RETRIEVAL DEBUG: This could be due to:")
328
+ logger.warning(f" - Query too specific for available documents")
329
+ logger.warning(f" - Filters too restrictive")
330
+ logger.warning(f" - Vector store connection issues")
331
+ logger.warning(f" - Embedding model issues")
332
+ else:
333
+ logger.info(f"✅ DOCUMENTS RETRIEVED: {len(result.sources)} documents found")
334
+ for i, doc in enumerate(result.sources[:3]): # Log first 3 docs
335
+ logger.info(f" Doc {i+1}: {getattr(doc, 'metadata', {}).get('filename', 'Unknown')[:50]}...")
336
+
337
+ state["retrieved_documents"] = result.sources
338
+ state["agent_logs"].append(f"RESPONSE AGENT: Retrieved {len(result.sources)} documents")
339
+
340
+ # Check highest similarity score
341
+ highest_score = 0.0
342
+ if result.sources:
343
+ # Check reranked_score first (more accurate), fallback to original_score
344
+ for doc in result.sources:
345
+ score = doc.metadata.get('reranked_score') or doc.metadata.get('original_score', 0.0)
346
+ if score > highest_score:
347
+ highest_score = score
348
+
349
+ logger.info(f"📝 RESPONSE AGENT: Highest similarity score: {highest_score:.4f}")
350
+
351
+ # If highest score is too low, don't use retrieved documents
352
+ if highest_score <= 0.15:
353
+ logger.warning(f"⚠️ RESPONSE AGENT: Low similarity score ({highest_score:.4f} <= 0.15), using LLM knowledge only")
354
+ response = self._generate_conversational_response_without_docs(
355
+ state["current_query"],
356
+ state["messages"]
357
+ )
358
+ else:
359
+ # Generate conversational response with documents
360
+ logger.info(f"📝 RESPONSE AGENT: Generating conversational response from {len(result.sources)} documents")
361
+ response = self._generate_conversational_response(
362
+ state["current_query"],
363
+ result.sources,
364
+ result.answer,
365
+ state["messages"]
366
+ )
367
+
368
+ logger.info(f"📝 RESPONSE AGENT: Response generated: {response[:100]}...")
369
+
370
+ state["final_response"] = response
371
+ state["last_ai_message_time"] = time.time()
372
+
373
+ logger.info(f"📝 RESPONSE AGENT: Answer generation complete")
374
+
375
+ except Exception as e:
376
+ logger.error(f"❌ RESPONSE AGENT ERROR: {e}")
377
+ state["final_response"] = "I apologize, but I encountered an error while retrieving information. Please try again."
378
+ state["last_ai_message_time"] = time.time()
379
+
380
+ return state
381
+
382
+ def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
383
+ """Extract UI filters from query"""
384
+ filters = {}
385
+
386
+ # Look for FILTER CONTEXT in query
387
+ if "FILTER CONTEXT:" in query:
388
+ # Extract the entire filter section (until USER QUERY: or end of query)
389
+ filter_section = query.split("FILTER CONTEXT:")[1]
390
+ if "USER QUERY:" in filter_section:
391
+ filter_section = filter_section.split("USER QUERY:")[0]
392
+ filter_section = filter_section.strip()
393
+
394
+ # Parse sources
395
+ if "Sources:" in filter_section:
396
+ sources_line = [line for line in filter_section.split('\n') if line.strip().startswith('Sources:')][0]
397
+ sources_str = sources_line.split("Sources:")[1].strip()
398
+ if sources_str and sources_str != "None":
399
+ filters["sources"] = [s.strip() for s in sources_str.split(",")]
400
+
401
+ # Parse years
402
+ if "Years:" in filter_section:
403
+ years_line = [line for line in filter_section.split('\n') if line.strip().startswith('Years:')][0]
404
+ years_str = years_line.split("Years:")[1].strip()
405
+ if years_str and years_str != "None":
406
+ filters["years"] = [y.strip() for y in years_str.split(",")]
407
+
408
+ # Parse districts
409
+ if "Districts:" in filter_section:
410
+ districts_line = [line for line in filter_section.split('\n') if line.strip().startswith('Districts:')][0]
411
+ districts_str = districts_line.split("Districts:")[1].strip()
412
+ if districts_str and districts_str != "None":
413
+ filters["districts"] = [d.strip() for d in districts_str.split(",")]
414
+
415
+ # Parse filenames
416
+ if "Filenames:" in filter_section:
417
+ filenames_line = [line for line in filter_section.split('\n') if line.strip().startswith('Filenames:')][0]
418
+ filenames_str = filenames_line.split("Filenames:")[1].strip()
419
+ if filenames_str and filenames_str != "None":
420
+ filters["filenames"] = [f.strip() for f in filenames_str.split(",")]
421
+
422
+ return filters
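To illustrate the round trip with the UI, a query prefixed as shown in app.py (values illustrative), such as:

    FILTER CONTEXT:
    Sources: Local Government
    Years: 2022, 2023
    USER QUERY:
    ...

would yield {"sources": ["Local Government"], "years": ["2022", "2023"]} from this method.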
423
+
424
+ def _analyze_query_context(self, query: str, messages: List[Any], ui_filters: Dict[str, List[str]]) -> QueryContext:
425
+ """Analyze query context using LLM"""
426
+ logger.info(f"🔍 QUERY ANALYSIS: '{query[:50]}...' | UI filters: {ui_filters} | Messages: {len(messages)}")
427
+
428
+ # Build conversation context
429
+ conversation_context = ""
430
+ for i, msg in enumerate(messages[-6:]): # Last 6 messages
431
+ if isinstance(msg, HumanMessage):
432
+ conversation_context += f"User: {msg.content}\n"
433
+ elif isinstance(msg, AIMessage):
434
+ conversation_context += f"Assistant: {msg.content}\n"
435
+
436
+ # Create analysis prompt
437
+ analysis_prompt = ChatPromptTemplate.from_messages([
438
+ SystemMessage(content=f"""You are the Main Agent in an advanced multi-agent RAG system for audit report analysis.
439
+
440
+ 🎯 PRIMARY GOAL: Intelligently analyze user queries and determine the optimal conversation flow, whether that's answering directly, asking follow-ups, or proceeding to RAG retrieval.
441
+
442
+ 🧠 INTELLIGENCE LEVEL: You are a sophisticated conversational AI that can handle any type of user interaction - from greetings to complex audit queries.
443
+
444
+ 📊 YOUR EXPERTISE: You specialize in analyzing audit reports from various sources (Local Government, Ministry, Hospital, etc.) across different years and districts in Uganda.
445
+
446
+ 🔍 AVAILABLE FILTERS:
447
+ - Years: {', '.join(self.year_whitelist)}
448
+ - Current year: {self.current_year}, Previous year: {self.previous_year}
449
+ - Sources: {', '.join(self.source_whitelist)}
450
+ - Districts: {', '.join(self.district_whitelist[:50])}... (and {len(self.district_whitelist)-50} more)
451
+
452
+ 🎛️ UI FILTERS PROVIDED: {ui_filters}
453
+
454
+ 📋 UI FILTER HANDLING:
455
+ - If UI filters contain multiple values (e.g., districts: ['Lwengo', 'Kiboga']), extract ALL values
456
+ - For multiple districts: extract each district separately and validate each one
457
+ - For multiple years: extract each year separately and validate each one
458
+ - For multiple sources: extract each source separately and validate each one
459
+ - UI filters take PRIORITY over conversation context - use them first
460
+
461
+ 🧭 CONVERSATION FLOW INTELLIGENCE:
462
+
463
+ 1. **GREETINGS & GENERAL CHAT**:
464
+ - If user greets you ("Hi", "Hello", "How are you"), respond warmly and guide them to audit-related questions
465
+ - Example: "Hello! I'm here to help you analyze audit reports. What would you like to know about budget allocations, expenditures, or audit findings?"
466
+
467
+ 2. **EDGE CASES**:
468
+ - Handle "What can you do?", "Help", "I don't know what to ask" with helpful guidance
469
+ - Example: "I can help you analyze audit reports! Try asking about budget allocations, salary management, PDM implementation, or any specific audit findings."
470
+
471
+ 3. **AUDIT QUERIES**:
472
+ - Extract ONLY values that EXACTLY match the available lists above
473
+ - DO NOT hallucinate or infer values not in the lists
474
+ - If user mentions "salary payroll management" - this is NOT a valid source filter
475
+
476
+ **YEAR EXTRACTION**:
477
+ - If user mentions "2023" and it's in the years list - extract "2023"
478
+ - If user mentions "2022 / 23" - extract ["2022", "2023"] (as a JSON array)
479
+ - If user mentions "2022-2023" - extract ["2022", "2023"] (as a JSON array)
480
+ - If user mentions "latest couple of years" - extract the 2 most recent years from available data as JSON array
481
+ - Always return years as JSON arrays when multiple years are mentioned
482
+
483
+ **DISTRICT EXTRACTION**:
484
+ - If user mentions "Kampala" and it's in the districts list - extract "Kampala"
485
+ - If user mentions "Pader District" - extract "Pader" (remove "District" suffix)
486
+ - If user mentions "Lwengo, Kiboga and Namutumba" - extract ["Lwengo", "Kiboga", "Namutumba"] (as JSON array)
487
+ - If user mentions "Lwengo District and Kiboga District" - extract ["Lwengo", "Kiboga"] (as JSON array, remove "District" suffix)
488
+ - Always return districts as JSON arrays when multiple districts are mentioned
489
+ - If no exact matches found, set extracted values to null
490
+
491
+ 4. **FILENAME FILTERING (MUTUALLY EXCLUSIVE)**:
492
+ - If UI provides filenames filter - ONLY use that, ignore all other filters (year, district, source)
493
+ - With filenames filter, no follow-ups needed - proceed directly to RAG
494
+ - When filenames are specified, skip filter inference entirely
495
+
496
+ 5. **HALLUCINATION PREVENTION**:
497
+ - If user asks about a specific report but NO filename is selected in UI and NONE is extracted from conversation - DO NOT hallucinate
498
+ - Clearly state: "I don't have any specific report selected. Could you please select a report from the list or tell me which report you'd like to analyze?"
499
+ - DO NOT pretend to know which report they mean
500
+ - DO NOT infer reports from context alone - only use explicitly mentioned reports
501
+
502
+ 6. **CONVERSATION CONTEXT AWARENESS**:
503
+ - ALWAYS consider the full conversation context when extracting filters
504
+ - If district was mentioned in previous messages, include it in current analysis
505
+ - If year was mentioned in previous messages, include it in current analysis
506
+ - If source was mentioned in previous messages, include it in current analysis
507
+ - Example: If conversation shows "User: Tell me about Pader District" then "User: 2023", extract both: district="Pader" and year="2023"
508
+
509
+ 7. **SMART FOLLOW-UP STRATEGY**:
510
+ - NEVER ask the same question twice in a row
511
+ - If user provides source info, ask for year or district next
512
+ - If user provides year info, ask for source or district next
513
+ - If user provides district info, ask for year or source next
514
+ - If user provides 2+ pieces of info, proceed to RAG instead of asking more
515
+ - Make follow-ups conversational and contextual, not robotic
516
+
517
+ 8. **DYNAMIC FOLLOW-UP EXAMPLES**:
518
+ - Budget queries: "What year are you interested in?" or "Which department - Local Government or Ministry?"
519
+ - PDM queries: "Which district are you interested in?" or "What year?"
520
+ - General queries: "Could you be more specific about what you'd like to know?"
521
+
522
+ 🎯 DECISION LOGIC:
523
+ - If query is a greeting/general chat → needs_follow_up: true, provide helpful guidance
524
+ - If query has 2+ pieces of info → needs_follow_up: false, proceed to RAG
525
+ - If query has 1 piece of info → needs_follow_up: true, ask for missing piece
526
+ - If query has 0 pieces of info → needs_follow_up: true, ask for clarification
527
+
528
+ RESPOND WITH JSON ONLY:
529
+ {{
530
+ "has_district": boolean,
531
+ "has_source": boolean,
532
+ "has_year": boolean,
533
+ "extracted_district": "single district name or JSON array of districts or null",
534
+ "extracted_source": "single source name or JSON array of sources or null",
535
+ "extracted_year": "single year or JSON array of years or null",
536
+ "confidence_score": 0.0-1.0,
537
+ "needs_follow_up": boolean,
538
+ "follow_up_question": "conversational question or helpful guidance or null"
539
+ }}"""),
540
+ HumanMessage(content=f"""Query: {query}
541
+
542
+ Conversation Context:
543
+ {conversation_context}
544
+
545
+ CRITICAL: You MUST analyze the FULL conversation context above, not just the current query.
546
+ - If ANY district was mentioned in previous messages, extract it
547
+ - If ANY year was mentioned in previous messages, extract it
548
+ - If ANY source was mentioned in previous messages, extract it
549
+ - Combine information from ALL messages in the conversation
550
+
551
+ Analyze this query using ONLY the exact values provided above:""")
552
+ ])
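+ # Illustrative shape of a well-formed reply, e.g. for "How were PDM funds
+ # used in Lwengo and Kiboga in 2023?" the model should return roughly:
+ # {"has_district": true, "has_source": false, "has_year": true,
+ #  "extracted_district": ["Lwengo", "Kiboga"], "extracted_source": null,
+ #  "extracted_year": "2023", "confidence_score": 0.9,
+ #  "needs_follow_up": false, "follow_up_question": null}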
553
+
554
+ try:
555
+ response = self.llm.invoke(analysis_prompt.format_messages())
556
+
557
+ # Clean the response to extract JSON
558
+ content = response.content.strip()
559
+ if content.startswith("```json"):
560
+ # Remove markdown formatting
561
+ content = content.replace("```json", "").replace("```", "").strip()
562
+ elif content.startswith("```"):
563
+ # Remove generic markdown formatting
564
+ content = content.replace("```", "").strip()
565
+
566
+ # Clean and parse JSON with better error handling
567
+ try:
568
+ # Remove comments (// and /* */) from JSON
569
+ import re
570
+ # Remove single-line comments
571
+ content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
572
+ # Remove multi-line comments
573
+ content = re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL)
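+ # e.g. a reply line such as '"has_district": true, // mentioned Lwengo'
+ # is reduced to '"has_district": true,' so json.loads() can parse it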
574
+
575
+ analysis = json.loads(content)
576
+ logger.info(f"🔍 QUERY ANALYSIS: ✅ Parsed successfully")
577
+ except json.JSONDecodeError as e:
578
+ logger.error(f"❌ JSON parsing failed: {e}")
579
+ logger.error(f"❌ Raw content: {content[:200]}...")
580
+
581
+ # Try to extract JSON from text if embedded
582
+ import re
583
+ json_match = re.search(r'\{.*\}', content, re.DOTALL)
584
+ if json_match:
585
+ try:
586
+ # Clean the extracted JSON
587
+ cleaned_json = json_match.group()
588
+ cleaned_json = re.sub(r'//.*?$', '', cleaned_json, flags=re.MULTILINE)
589
+ cleaned_json = re.sub(r'/\*.*?\*/', '', cleaned_json, flags=re.DOTALL)
590
+ analysis = json.loads(cleaned_json)
591
+ logger.info(f"🔍 QUERY ANALYSIS: ✅ Extracted and cleaned JSON from text")
592
+ except json.JSONDecodeError as e2:
593
+ logger.error(f"❌ Failed to extract JSON from text: {e2}")
594
+ # Return fallback context
595
+ context = QueryContext(
596
+ has_district=False,
597
+ has_source=False,
598
+ has_year=False,
599
+ extracted_district=None,
600
+ extracted_source=None,
601
+ extracted_year=None,
602
+ confidence_score=0.0,
603
+ needs_follow_up=True,
604
+ follow_up_question="I apologize, but I'm having trouble processing your request. Could you please rephrase it or ask for help?"
605
+ )
606
+ return context
607
+ else:
608
+ # Return fallback context
609
+ context = QueryContext(
610
+ has_district=False,
611
+ has_source=False,
612
+ has_year=False,
613
+ extracted_district=None,
614
+ extracted_source=None,
615
+ extracted_year=None,
616
+ confidence_score=0.0,
617
+ needs_follow_up=True,
618
+ follow_up_question="I apologize, but I'm having trouble processing your request. Could you please rephrase it or ask for help?"
619
+ )
620
+ return context
621
+
622
+ # Validate extracted values against whitelists
623
+ extracted_district = analysis.get("extracted_district")
624
+ extracted_source = analysis.get("extracted_source")
625
+ extracted_year = analysis.get("extracted_year")
626
+
627
+ logger.info(f"🔍 QUERY ANALYSIS: Raw extracted values - district: {extracted_district}, source: {extracted_source}, year: {extracted_year}")
628
+
629
+ # Validate district (handle both single values and arrays)
630
+ if extracted_district:
631
+ if isinstance(extracted_district, list):
632
+ # Validate each district in the array
633
+ valid_districts = []
634
+ for district in extracted_district:
635
+ if district in self.district_whitelist:
636
+ valid_districts.append(district)
637
+ else:
638
+ # Try removing "District" suffix
639
+ district_name = district.replace(" District", "").replace(" district", "")
640
+ if district_name in self.district_whitelist:
641
+ valid_districts.append(district_name)
642
+
643
+ if valid_districts:
644
+ extracted_district = valid_districts[0] if len(valid_districts) == 1 else valid_districts
645
+ logger.info(f"🔍 QUERY ANALYSIS: Extracted districts: {extracted_district}")
646
+ else:
647
+ logger.warning(f"⚠️ No valid districts found in: '{extracted_district}'")
648
+ extracted_district = None
649
+ else:
650
+ # Single district validation
651
+ if extracted_district not in self.district_whitelist:
652
+ # Try removing "District" suffix
653
+ district_name = extracted_district.replace(" District", "").replace(" district", "")
654
+ if district_name in self.district_whitelist:
655
+ logger.info(f"🔍 QUERY ANALYSIS: Normalized district '{extracted_district}' to '{district_name}'")
656
+ extracted_district = district_name
657
+ else:
658
+ logger.warning(f"⚠️ Invalid district extracted: '{extracted_district}' not in whitelist")
659
+ extracted_district = None
660
+
661
+ # Validate source (handle both single values and arrays)
662
+ if extracted_source:
663
+ if isinstance(extracted_source, list):
664
+ # Validate each source in the array
665
+ valid_sources = []
666
+ for source in extracted_source:
667
+ if source in self.source_whitelist:
668
+ valid_sources.append(source)
669
+ else:
670
+ logger.warning(f"⚠️ Invalid source in array: '{source}' not in whitelist")
671
+
672
+ if valid_sources:
673
+ extracted_source = valid_sources[0] if len(valid_sources) == 1 else valid_sources
674
+ logger.info(f"🔍 QUERY ANALYSIS: Extracted sources: {extracted_source}")
675
+ else:
676
+ logger.warning(f"⚠️ No valid sources found in: '{extracted_source}'")
677
+ extracted_source = None
678
+ else:
679
+ # Single source validation
680
+ if extracted_source not in self.source_whitelist:
681
+ logger.warning(f"⚠️ Invalid source extracted: '{extracted_source}' not in whitelist")
682
+ extracted_source = None
683
+
684
+ # Validate year (handle both single values and arrays)
685
+ if extracted_year:
686
+ if isinstance(extracted_year, list):
687
+ # Validate each year in the array
688
+ valid_years = []
689
+ for year in extracted_year:
690
+ year_str = str(year)
691
+ if year_str in self.year_whitelist:
692
+ valid_years.append(year_str)
693
+
694
+ if valid_years:
695
+ extracted_year = valid_years[0] if len(valid_years) == 1 else valid_years
696
+ logger.info(f"🔍 QUERY ANALYSIS: Extracted years: {extracted_year}")
697
+ else:
698
+ logger.warning(f"⚠️ No valid years found in: '{extracted_year}'")
699
+ extracted_year = None
700
+ else:
701
+ # Single year validation
702
+ year_str = str(extracted_year)
703
+ if year_str not in self.year_whitelist:
704
+ logger.warning(f"⚠️ Invalid year extracted: '{extracted_year}' not in whitelist")
705
+ extracted_year = None
706
+ else:
707
+ extracted_year = year_str
708
+
709
+ logger.info(f"🔍 QUERY ANALYSIS: Validated values - district: {extracted_district}, source: {extracted_source}, year: {extracted_year}")
710
+
711
+ # Create QueryContext object
712
+ context = QueryContext(
713
+ has_district=bool(extracted_district),
714
+ has_source=bool(extracted_source),
715
+ has_year=bool(extracted_year),
716
+ extracted_district=extracted_district,
717
+ extracted_source=extracted_source,
718
+ extracted_year=extracted_year,
719
+ ui_filters=ui_filters,
720
+ confidence_score=analysis.get("confidence_score", 0.0),
721
+ needs_follow_up=analysis.get("needs_follow_up", False),
722
+ follow_up_question=analysis.get("follow_up_question")
723
+ )
724
+
725
+ logger.info(f"🔍 QUERY ANALYSIS: Analysis complete - needs_follow_up: {context.needs_follow_up}, confidence: {context.confidence_score}")
726
+
727
+ # If filenames are provided in UI, skip follow-ups and proceed to RAG
728
+ if ui_filters and ui_filters.get("filenames"):
729
+ logger.info(f"🔍 QUERY ANALYSIS: Filenames provided, skipping follow-ups, proceeding to RAG")
730
+ context.needs_follow_up = False
731
+ context.follow_up_question = None
732
+
733
+ # Additional smart decision logic
734
+ if context.needs_follow_up:
735
+ # Check if we have enough information to proceed
736
+ info_count = sum([
737
+ bool(context.extracted_district),
738
+ bool(context.extracted_source),
739
+ bool(context.extracted_year)
740
+ ])
741
+
742
+ # Check if user is asking for more info vs providing it
743
+ query_lower = query.lower()
744
+ is_requesting_info = any(phrase in query_lower for phrase in [
745
+ "please provide", "could you provide", "can you provide",
746
+ "what is", "what are", "how much", "which", "what year",
747
+ "what district", "what source", "tell me about"
748
+ ])
749
+
750
+ # If we have 2+ pieces of info AND user is not requesting more info, proceed to RAG
751
+ if info_count >= 2 and not is_requesting_info:
752
+ logger.info(f"🔍 QUERY ANALYSIS: Smart override - have {info_count} pieces of info and user not requesting more, proceeding to RAG")
753
+ context.needs_follow_up = False
754
+ context.follow_up_question = None
755
+ elif info_count >= 2 and is_requesting_info:
756
+ logger.info(f"🔍 QUERY ANALYSIS: User requesting more info despite having {info_count} pieces, proceeding to RAG with comprehensive answer")
757
+ context.needs_follow_up = False
758
+ context.follow_up_question = None
759
+
760
+ return context
761
+
762
+ except Exception as e:
763
+ logger.error(f"❌ Query analysis failed: {e}")
764
+ # Fallback: proceed with RAG
765
+ return QueryContext(
766
+ has_district=bool(ui_filters.get("districts")),
767
+ has_source=bool(ui_filters.get("sources")),
768
+ has_year=bool(ui_filters.get("years")),
769
+ ui_filters=ui_filters,
770
+ confidence_score=0.5,
771
+ needs_follow_up=False
772
+ )
773
+
774
+ def _rewrite_query_for_rag(self, messages: List[Any], context: QueryContext) -> str:
775
+ """Rewrite query for optimal RAG retrieval"""
776
+ logger.info("🔄 QUERY REWRITING: Starting query rewrite for RAG")
777
+ logger.info(f"🔄 QUERY REWRITING: Processing {len(messages)} messages")
778
+
779
+ # Build conversation context
780
+ logger.info(f"🔄 QUERY REWRITING: Building conversation context from last 6 messages")
781
+ conversation_lines = []
782
+ for i, msg in enumerate(messages[-6:]):
783
+ if isinstance(msg, HumanMessage):
784
+ conversation_lines.append(f"User: {msg.content}")
785
+ logger.info(f"🔄 QUERY REWRITING: Message {i+1}: User - {msg.content[:50]}...")
786
+ elif isinstance(msg, AIMessage):
787
+ conversation_lines.append(f"Assistant: {msg.content}")
788
+ logger.info(f"🔄 QUERY REWRITING: Message {i+1}: Assistant - {msg.content[:50]}...")
789
+
790
+ convo_text = "\n".join(conversation_lines)
791
+ logger.info(f"🔄 QUERY REWRITING: Conversation context built ({len(convo_text)} chars)")
792
+
793
+ # Create rewrite prompt
794
+ rewrite_prompt = ChatPromptTemplate.from_messages([
795
+ SystemMessage(content=f"""You are a query rewriter for RAG retrieval.
796
+
797
+ GOAL: Create the best possible search query for document retrieval.
798
+
799
+ CRITICAL RULES:
800
+ 1. Focus on the core information need from the conversation
801
+ 2. Remove meta-verbs like "summarize", "list", "compare", "how much", "what" - keep the content focus
802
+ 3. DO NOT include filter details (years, districts, sources) - these are applied separately as filters
803
+ 4. DO NOT include specific years, district names, or source types in the query
804
+ 5. Output ONE clear sentence suitable for vector search
805
+ 6. Keep it generic and focused on the topic/subject matter
806
+
807
+ EXAMPLES:
808
+ - "What are the top challenges in budget allocation?" → "budget allocation challenges"
809
+ - "How were PDM administrative costs utilized in 2023?" → "PDM administrative costs utilization"
810
+ - "Compare salary management across districts" → "salary management"
811
+ - "How much was budget allocation for Local Government in 2023?" → "budget allocation"
812
+
813
+ OUTPUT FORMAT:
814
+ Provide your response in this exact format:
815
+
816
+ EXPLANATION: [Your reasoning here]
817
+ QUERY: [One clean sentence for retrieval]
818
+
819
+ The QUERY line will be extracted and used directly for RAG retrieval."""),
820
+ HumanMessage(content=f"""Conversation:
821
+ {convo_text}
822
+
823
+ Rewrite the best retrieval query:""")
824
+ ])
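+ # A typical reply is parsed line by line, e.g.:
+ #   EXPLANATION: The user wants spending details; year and district are handled by filters.
+ #   QUERY: PDM administrative costs utilization
+ # Only the text after "QUERY:" is kept and sent to retrieval.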
825
+
826
+ try:
827
+ logger.info(f"🔄 QUERY REWRITING: Calling LLM for query rewrite")
828
+ response = self.llm.invoke(rewrite_prompt.format_messages())
829
+ logger.info(f"🔄 QUERY REWRITING: LLM response received: {response.content[:100]}...")
830
+
831
+ rewritten = response.content.strip()
832
+
833
+ # Extract only the QUERY line from the structured response
834
+ lines = rewritten.split('\n')
835
+ query_line = None
836
+ for line in lines:
837
+ if line.strip().startswith('QUERY:'):
838
+ query_line = line.replace('QUERY:', '').strip()
839
+ break
840
+
841
+ if query_line and len(query_line) > 5:
842
+ logger.info(f"🔄 QUERY REWRITING: Query rewritten successfully: '{query_line[:50]}...'")
843
+ return query_line
844
+ else:
845
+ logger.info(f"🔄 QUERY REWRITING: No QUERY line found or too short, using fallback")
846
+ # Fallback to last user message
847
+ for msg in reversed(messages):
848
+ if isinstance(msg, HumanMessage):
849
+ logger.info(f"🔄 QUERY REWRITING: Using fallback message: '{msg.content[:50]}...'")
850
+ return msg.content
851
+ logger.info(f"🔄 QUERY REWRITING: Using default fallback")
852
+ return "audit report information"
853
+
854
+ except Exception as e:
855
+ logger.error(f"❌ QUERY REWRITING: Error during rewrite: {e}")
856
+ # Fallback
857
+ for msg in reversed(messages):
858
+ if isinstance(msg, HumanMessage):
859
+ logger.info(f"🔄 QUERY REWRITING: Using error fallback message: '{msg.content[:50]}...'")
860
+ return msg.content
861
+ logger.info(f"🔄 QUERY REWRITING: Using default error fallback")
862
+ return "audit report information"
863
+
864
+ def _build_filters(self, context: QueryContext) -> Dict[str, Any]:
865
+ """Build filters for RAG retrieval"""
866
+ logger.info("🔧 FILTER BUILDING: Starting filter construction")
867
+ filters = {}
868
+
869
+ # Check for filename filtering first (mutually exclusive)
870
+ if context.ui_filters and context.ui_filters.get("filenames"):
871
+ logger.info(f"🔧 FILTER BUILDING: Filename filtering requested (mutually exclusive mode)")
872
+ filters["filenames"] = context.ui_filters["filenames"]
873
+ logger.info(f"🔧 FILTER BUILDING: Added filenames filter: {context.ui_filters['filenames']}")
874
+ logger.info(f"🔧 FILTER BUILDING: Final filters: {filters}")
875
+ return filters # Return early, skip all other filters
876
+
877
+ # UI filters take priority, but merge with extracted context if UI filters are incomplete
878
+ if context.ui_filters:
879
+ logger.info(f"🔧 FILTER BUILDING: UI filters present: {context.ui_filters}")
880
+
881
+ # Add UI filters first
882
+ if context.ui_filters.get("sources"):
883
+ filters["sources"] = context.ui_filters["sources"]
884
+ logger.info(f"🔧 FILTER BUILDING: Added sources filter from UI: {context.ui_filters['sources']}")
885
+
886
+ if context.ui_filters.get("years"):
887
+ filters["year"] = context.ui_filters["years"]
888
+ logger.info(f"🔧 FILTER BUILDING: Added years filter from UI: {context.ui_filters['years']}")
889
+
890
+ if context.ui_filters.get("districts"):
891
+ # Normalize district names to title case (match Qdrant metadata format)
892
+ normalized_districts = [d.title() for d in context.ui_filters['districts']]
893
+ filters["district"] = normalized_districts
894
+ logger.info(f"🔧 FILTER BUILDING: Added districts filter from UI: {context.ui_filters['districts']} → normalized: {normalized_districts}")
895
+
896
+ # Merge with extracted context for missing filters
897
+ if not filters.get("year") and context.extracted_year:
898
+ # Handle both single values and arrays
899
+ if isinstance(context.extracted_year, list):
900
+ filters["year"] = context.extracted_year
901
+ else:
902
+ filters["year"] = [context.extracted_year]
903
+ logger.info(f"🔧 FILTER BUILDING: Added extracted year filter (UI missing): {context.extracted_year}")
904
+
905
+ if not filters.get("district") and context.extracted_district:
906
+ # Handle both single values and arrays
907
+ if isinstance(context.extracted_district, list):
908
+ # Normalize district names to title case (match Qdrant metadata format)
909
+ normalized = [d.title() for d in context.extracted_district]
910
+ filters["district"] = normalized
911
+ else:
912
+ filters["district"] = [context.extracted_district.title()]
913
+ logger.info(f"🔧 FILTER BUILDING: Added extracted district filter (UI missing): {context.extracted_district}")
914
+
915
+ if not filters.get("sources") and context.extracted_source:
916
+ # Handle both single values and arrays
917
+ if isinstance(context.extracted_source, list):
918
+ filters["sources"] = context.extracted_source
919
+ else:
920
+ filters["sources"] = [context.extracted_source]
921
+ logger.info(f"🔧 FILTER BUILDING: Added extracted source filter (UI missing): {context.extracted_source}")
922
+ else:
923
+ logger.info(f"🔧 FILTER BUILDING: No UI filters, using extracted context")
924
+ # Use extracted context
925
+ if context.extracted_source:
926
+ # Handle both single values and arrays
927
+ if isinstance(context.extracted_source, list):
928
+ filters["sources"] = context.extracted_source
929
+ else:
930
+ filters["sources"] = [context.extracted_source]
931
+ logger.info(f"🔧 FILTER BUILDING: Added extracted source filter: {context.extracted_source}")
932
+
933
+ if context.extracted_year:
934
+ # Handle both single values and arrays
935
+ if isinstance(context.extracted_year, list):
936
+ filters["year"] = context.extracted_year
937
+ else:
938
+ filters["year"] = [context.extracted_year]
939
+ logger.info(f"🔧 FILTER BUILDING: Added extracted year filter: {context.extracted_year}")
940
+
941
+ if context.extracted_district:
942
+ # Handle both single values and arrays
943
+ if isinstance(context.extracted_district, list):
944
+ filters["district"] = context.extracted_district
945
+ else:
946
+ filters["district"] = [context.extracted_district]
947
+ logger.info(f"🔧 FILTER BUILDING: Added extracted district filter: {context.extracted_district}")
948
+
949
+ logger.info(f"🔧 FILTER BUILDING: Final filters: {filters}")
950
+ return filters
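+ # For instance, with no UI filters and an extracted district list plus a year,
+ # the result is roughly {"year": ["2023"], "district": ["Lwengo", "Kiboga"]};
+ # with UI filenames selected it would instead be just {"filenames": [...]}.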
951
+
952
+ def _generate_conversational_response(self, query: str, documents: List[Any], rag_answer: str, messages: List[Any]) -> str:
953
+ """Generate conversational response from RAG results"""
954
+ logger.info("💬 RESPONSE GENERATION: Starting conversational response generation")
955
+ logger.info(f"💬 RESPONSE GENERATION: Processing {len(documents)} documents")
956
+ logger.info(f"💬 RESPONSE GENERATION: Query: '{query[:50]}...'")
957
+
958
+ # Create response prompt
959
+ logger.info(f"💬 RESPONSE GENERATION: Building response prompt")
960
+ response_prompt = ChatPromptTemplate.from_messages([
961
+ SystemMessage(content="""You are a helpful audit report assistant. Generate a natural, conversational response.
962
+
963
+ RULES:
964
+ 1. Answer the user's question directly and clearly
965
+ 2. Use the retrieved documents as evidence
966
+ 3. Be conversational, not technical
967
+ 4. Don't mention scores, retrieval details, or technical implementation
968
+ 5. If relevant documents were found, reference them naturally
969
+ 6. If no relevant documents, explain based on your knowledge (if you have it) or just say you do not have enough information.
970
+ 7. If the passages have useful facts or numbers, use them in your answer.
971
+ 8. When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
972
+ 9. Do not use the sentence 'Doc i says ...' to say where information came from.
973
+ 10. If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
974
+ 11. Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
975
+ 12. If it makes sense, use bullet points and lists to make your answers easier to understand.
976
+ 13. You do not need to use every passage. Only use the ones that help answer the question.
977
+ 14. If the documents do not have the information needed to answer the question, just say you do not have enough information.
978
+
979
+
980
+ TONE: Professional but friendly, like talking to a colleague."""),
981
+ HumanMessage(content=f"""User Question: {query}
982
+
983
+ Retrieved Documents: {len(documents)} documents found
984
+
985
+ RAG Answer: {rag_answer}
986
+
987
+ Generate a conversational response:""")
988
+ ])
989
+
990
+ try:
991
+ logger.info(f"💬 RESPONSE GENERATION: Calling LLM for final response")
992
+ response = self.llm.invoke(response_prompt.format_messages())
993
+ logger.info(f"💬 RESPONSE GENERATION: LLM response received: {response.content[:100]}...")
994
+ return response.content.strip()
995
+ except Exception as e:
996
+ logger.error(f"❌ RESPONSE GENERATION: Error during generation: {e}")
997
+ logger.info(f"💬 RESPONSE GENERATION: Using RAG answer as fallback")
998
+ return rag_answer # Fallback to RAG answer
999
+
1000
+ def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1001
+ """Generate conversational response using only LLM knowledge and conversation history"""
1002
+ logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
1003
+ logger.info(f"💬 RESPONSE GENERATION (NO DOCS): Query: '{query[:50]}...'")
1004
+
1005
+ # Build conversation context
1006
+ conversation_context = ""
1007
+ for i, msg in enumerate(messages[-6:]): # Last 6 messages for context
1008
+ if isinstance(msg, HumanMessage):
1009
+ conversation_context += f"User: {msg.content}\n"
1010
+ elif isinstance(msg, AIMessage):
1011
+ conversation_context += f"Assistant: {msg.content}\n"
1012
+
1013
+ # Create response prompt
1014
+ logger.info(f"💬 RESPONSE GENERATION (NO DOCS): Building response prompt")
1015
+ response_prompt = ChatPromptTemplate.from_messages([
1016
+ SystemMessage(content="""You are a helpful audit report assistant. Generate a natural, conversational response.
1017
+
1018
+ RULES:
1019
+ 1. Answer the user's question directly and clearly based on your knowledge
1020
+ 2. Use conversation history for context
1021
+ 3. Be conversational, not technical
1022
+ 4. Acknowledge if the answer is based on general knowledge rather than specific documents
1023
+ 5. Stay professional but friendly
1024
+
1025
+ TONE: Professional but friendly, like talking to a colleague."""),
1026
+ HumanMessage(content=f"""Current Question: {query}
1027
+
1028
+ Conversation History:
1029
+ {conversation_context}
1030
+
1031
+ Generate a conversational response based on your knowledge:""")
1032
+ ])
1033
+
1034
+ try:
1035
+ logger.info(f"💬 RESPONSE GENERATION (NO DOCS): Calling LLM")
1036
+ response = self.llm.invoke(response_prompt.format_messages())
1037
+ logger.info(f"💬 RESPONSE GENERATION (NO DOCS): LLM response received: {response.content[:100]}...")
1038
+ return response.content.strip()
1039
+ except Exception as e:
1040
+ logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error during generation: {e}")
1041
+ return "I apologize, but I encountered an error. Please try asking your question differently."
1042
+
1043
+ def chat(self, user_input: str, conversation_id: str = "default") -> Dict[str, Any]:
1044
+ """Main chat interface"""
1045
+ logger.info(f"💬 MULTI-AGENT CHAT: Processing '{user_input[:50]}...'")
1046
+
1047
+ # Load conversation
1048
+ logger.info(f"💬 MULTI-AGENT CHAT: Loading conversation {conversation_id}")
1049
+ conversation_file = self.conversations_dir / f"{conversation_id}.json"
1050
+ conversation = self._load_conversation(conversation_file)
1051
+ logger.info(f"💬 MULTI-AGENT CHAT: Loaded {len(conversation['messages'])} previous messages")
1052
+
1053
+ # Add user message
1054
+ conversation["messages"].append(HumanMessage(content=user_input))
1055
+ logger.info(f"💬 MULTI-AGENT CHAT: Added user message to conversation")
1056
+
1057
+ # Prepare state
1058
+ logger.info(f"💬 MULTI-AGENT CHAT: Preparing state for graph execution")
1059
+ state = MultiAgentState(
1060
+ conversation_id=conversation_id,
1061
+ messages=conversation["messages"],
1062
+ current_query=user_input,
1063
+ query_context=None,
1064
+ rag_query=None,
1065
+ rag_filters=None,
1066
+ retrieved_documents=None,
1067
+ final_response=None,
1068
+ agent_logs=[],
1069
+ conversation_context=conversation.get("context", {}),
1070
+ session_start_time=conversation["session_start_time"],
1071
+ last_ai_message_time=conversation["last_ai_message_time"]
1072
+ )
1073
+
1074
+ # Run multi-agent graph
1075
+ logger.info(f"💬 MULTI-AGENT CHAT: Executing multi-agent graph")
1076
+ final_state = self.graph.invoke(state)
1077
+ logger.info(f"💬 MULTI-AGENT CHAT: Graph execution completed")
1078
+
1079
+ # Add AI response to conversation
1080
+ if final_state["final_response"]:
1081
+ conversation["messages"].append(AIMessage(content=final_state["final_response"]))
1082
+ logger.info(f"💬 MULTI-AGENT CHAT: Added AI response to conversation")
1083
+
1084
+ # Update conversation
1085
+ conversation["last_ai_message_time"] = final_state["last_ai_message_time"]
1086
+ conversation["context"] = final_state["conversation_context"]
1087
+
1088
+ # Save conversation
1089
+ logger.info(f"💬 MULTI-AGENT CHAT: Saving conversation")
1090
+ self._save_conversation(conversation_file, conversation)
1091
+
1092
+ logger.info("✅ MULTI-AGENT CHAT: Completed")
1093
+
1094
+ # Return response and RAG results
1095
+ return {
1096
+ 'response': final_state["final_response"],
1097
+ 'rag_result': {
1098
+ 'sources': final_state["retrieved_documents"] or [],
1099
+ 'answer': final_state["final_response"]
1100
+ },
1101
+ 'agent_logs': final_state["agent_logs"],
1102
+ 'actual_rag_query': final_state.get("rag_query", "")
1103
+ }
1104
+
1105
+ def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
1106
+ """Load conversation from file"""
1107
+ if conversation_file.exists():
1108
+ try:
1109
+ with open(conversation_file) as f:
1110
+ data = json.load(f)
1111
+ # Convert message dicts back to LangChain messages
1112
+ messages = []
1113
+ for msg_data in data.get("messages", []):
1114
+ if msg_data["type"] == "human":
1115
+ messages.append(HumanMessage(content=msg_data["content"]))
1116
+ elif msg_data["type"] == "ai":
1117
+ messages.append(AIMessage(content=msg_data["content"]))
1118
+ data["messages"] = messages
1119
+ return data
1120
+ except Exception as e:
1121
+ logger.warning(f"Could not load conversation: {e}")
1122
+
1123
+ # Return default conversation
1124
+ return {
1125
+ "messages": [],
1126
+ "session_start_time": time.time(),
1127
+ "last_ai_message_time": time.time(),
1128
+ "context": {}
1129
+ }
1130
+
1131
+ def _save_conversation(self, conversation_file: Path, conversation: Dict[str, Any]):
1132
+ """Save conversation to file"""
1133
+ try:
1134
+ # Convert messages to serializable format
1135
+ messages_data = []
1136
+ for msg in conversation["messages"]:
1137
+ if isinstance(msg, HumanMessage):
1138
+ messages_data.append({"type": "human", "content": msg.content})
1139
+ elif isinstance(msg, AIMessage):
1140
+ messages_data.append({"type": "ai", "content": msg.content})
1141
+
1142
+ conversation_data = {
1143
+ "messages": messages_data,
1144
+ "session_start_time": conversation["session_start_time"],
1145
+ "last_ai_message_time": conversation["last_ai_message_time"],
1146
+ "context": conversation.get("context", {})
1147
+ }
1148
+
1149
+ with open(conversation_file, 'w') as f:
1150
+ json.dump(conversation_data, f, indent=2)
1151
+
1152
+ except Exception as e:
1153
+ logger.error(f"Could not save conversation: {e}")
1154
+
1155
+
1156
+ def get_multi_agent_chatbot():
1157
+ """Get multi-agent chatbot instance"""
1158
+ return MultiAgentRAGChatbot()
1159
+
1160
+ if __name__ == "__main__":
1161
+ # Test the multi-agent system
1162
+ chatbot = MultiAgentRAGChatbot()
1163
+
1164
+ # Test conversation
1165
+ result = chatbot.chat("List me top 10 challenges in budget allocation for the last 3 years")
1166
+ print("Response:", result['response'])
1167
+ print("Agent Logs:", result['agent_logs'])
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
1
+ streamlit>=1.28.0
2
+ langchain>=0.1.0
3
+ langchain-core>=0.1.0
4
+ langgraph>=0.0.20
5
+ qdrant-client>=1.7.0
6
+ python-dotenv>=1.0.0
7
+ openai>=1.0.0
8
+ snowflake-connector-python>=4.0.0
9
+ pydantic>=2.0.0
smart_chatbot.py ADDED
@@ -0,0 +1,1098 @@
1
+ """
2
+ Intelligent RAG Chatbot with Smart Query Analysis and Conversation Management
3
+
4
+ This chatbot provides intelligent conversation flow with:
5
+ - Smart query analysis and expansion
6
+ - Single LangSmith conversation traces
7
+ - Local conversation logging
8
+ - Context-aware RAG retrieval
9
+ - Natural conversation without technical jargon
10
+ """
11
+
12
+ import os
13
+ import json
14
+ import time
15
+ import logging
16
+ from pathlib import Path
17
+ from dataclasses import dataclass
18
+ from datetime import datetime, timedelta
19
+ from typing import Dict, List, Any, Optional, TypedDict
20
+
21
+
22
+ import re
23
+ from langgraph.graph import StateGraph, END
24
+ from langchain_core.prompts import ChatPromptTemplate
25
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
26
+
27
+ from src.pipeline import PipelineManager
28
+ from src.config.loader import load_config
29
+
30
+
31
+ @dataclass
32
+ class QueryAnalysis:
33
+ """Analysis result of a user query"""
34
+ has_district: bool
35
+ has_source: bool
36
+ has_year: bool
37
+ extracted_district: Optional[str]
38
+ extracted_source: Optional[str]
39
+ extracted_year: Optional[str]
40
+ confidence_score: float
41
+ can_answer_directly: bool
42
+ missing_filters: List[str]
43
+ suggested_follow_up: Optional[str]
44
+ expanded_query: Optional[str] = None # Query expansion for better RAG
45
+
46
+
47
+ class ConversationState(TypedDict):
48
+ """State for the conversation flow"""
49
+ conversation_id: str
50
+ messages: List[Any]
51
+ current_query: str
52
+ query_analysis: Optional[QueryAnalysis]
53
+ rag_query: Optional[str]
54
+ rag_result: Optional[Any]
55
+ final_response: Optional[str]
56
+ conversation_context: Dict[str, Any] # Store conversation context
57
+ session_start_time: float
58
+ last_ai_message_time: float
59
+
60
+
61
+ class IntelligentRAGChatbot:
62
+ """Intelligent chatbot with smart query analysis and conversation management"""
63
+
64
+ def __init__(self, suppress_logs=False):
65
+ """Initialize the intelligent chatbot"""
66
+ # Setup logger to avoid cluttering UI
67
+ self.logger = logging.getLogger(__name__)
68
+ if suppress_logs:
69
+ self.logger.setLevel(logging.CRITICAL) # Suppress all logs
70
+ else:
71
+ self.logger.setLevel(logging.INFO)
72
+ if not self.logger.handlers:
73
+ handler = logging.StreamHandler()
74
+ formatter = logging.Formatter('%(message)s')
75
+ handler.setFormatter(formatter)
76
+ self.logger.addHandler(handler)
77
+
78
+ self.logger.info("🤖 INITIALIZING: Intelligent RAG Chatbot")
79
+
80
+ # Load configuration first
81
+ self.config = load_config()
82
+
83
+ # Use the same LLM configuration as the existing system
84
+ from auditqa.llm.adapters import get_llm_client
85
+
86
+ # Get LLM client using the same configuration
87
+ reader_config = self.config.get("reader", {})
88
+ default_type = reader_config.get("default_type", "INF_PROVIDERS")
89
+
90
+ # Convert to lowercase as that's how it's registered
91
+ provider_name = default_type.lower()
92
+
93
+ self.llm_adapter = get_llm_client(provider_name, self.config)
94
+
95
+ # Create a simple wrapper for LangChain compatibility
96
+ class LLMWrapper:
97
+ def __init__(self, adapter):
98
+ self.adapter = adapter
99
+
100
+ def invoke(self, messages):
101
+ # Convert LangChain messages to the format expected by the adapter
102
+ if isinstance(messages, list):
103
+ # Convert LangChain messages to dict format
104
+ message_dicts = []
105
+ for msg in messages:
106
+ if hasattr(msg, 'content'):
107
+ role = "user" if isinstance(msg, HumanMessage) else "assistant"
108
+ message_dicts.append({"role": role, "content": msg.content})
109
+ else:
110
+ message_dicts.append({"role": "user", "content": str(msg)})
111
+ else:
112
+ # Single message
113
+ message_dicts = [{"role": "user", "content": str(messages)}]
114
+
115
+ # Use the adapter to generate response
116
+ llm_response = self.adapter.generate(message_dicts)
117
+
118
+ # Return in LangChain format
119
+ class MockResponse:
120
+ def __init__(self, content):
121
+ self.content = content
122
+
123
+ return MockResponse(llm_response.content)
124
+
125
+ self.llm = LLMWrapper(self.llm_adapter)
126
+
127
+ # Initialize pipeline manager for RAG
128
+ self.logger.info("🔧 PIPELINE: Initializing PipelineManager...")
129
+ self.pipeline_manager = PipelineManager(self.config)
130
+
131
+ # Ensure vectorstore is connected
132
+ self.logger.info("🔗 VECTORSTORE: Connecting to Qdrant...")
133
+ try:
134
+ vectorstore = self.pipeline_manager.vectorstore_manager.connect_to_existing()
135
+ self.logger.info("✅ VECTORSTORE: Connected successfully")
136
+ except Exception as e:
137
+ self.logger.error(f"❌ VECTORSTORE: Connection failed: {e}")
138
+
139
+ # Fix LLM client to use the same provider as chatbot
140
+ self.logger.info("🔧 LLM: Fixing PipelineManager LLM client...")
141
+ self.pipeline_manager.llm_client = self.llm_adapter
142
+ self.logger.info("✅ LLM: PipelineManager now uses same LLM as chatbot")
143
+
144
+ self.logger.info("✅ PIPELINE: PipelineManager initialized")
145
+
146
+ # Available metadata for filtering
147
+ self.available_metadata = {
148
+ 'sources': [
149
+ 'KCCA', 'MAAIF', 'MWTS', 'Gulu DLG', 'Kalangala DLG', 'Namutumba DLG',
150
+ 'Lwengo DLG', 'Kiboga DLG', 'Annual Consolidated OAG', 'Consolidated',
151
+ 'Hospital', 'Local Government', 'Ministry, Department and Agency',
152
+ 'Project', 'Thematic', 'Value for Money'
153
+ ],
154
+ 'years': ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025'],
155
+ 'districts': [
156
+ 'Gulu', 'Kalangala', 'Kampala', 'Namutumba', 'Lwengo', 'Kiboga',
157
+ 'Fort Portal', 'Arua', 'Kasese', 'Kabale', 'Masindi', 'Mbale', 'Jinja', 'Masaka', 'Mbarara',
158
+ 'KCCA'
159
+ ]
160
+ }
161
+
162
+ # Try to load district whitelist from filter_options.json
163
+ try:
164
+ fo = Path("filter_options.json")
165
+ if fo.exists():
166
+ with open(fo) as f:
167
+ data = json.load(f)
168
+ if isinstance(data, dict) and data.get("districts"):
169
+ self.district_whitelist = [d.strip() for d in data["districts"] if d]
170
+ else:
171
+ self.district_whitelist = self.available_metadata['districts']
172
+ else:
173
+ self.district_whitelist = self.available_metadata['districts']
174
+ except Exception:
175
+ self.district_whitelist = self.available_metadata['districts']
176
+
177
+ # Enrich whitelist from add_district_metadata.py if available
178
+ try:
179
+ from add_district_metadata import DistrictMetadataProcessor
180
+ proc = DistrictMetadataProcessor()
181
+ names = set()
182
+ for key, mapping in proc.district_mappings.items():
183
+ if getattr(mapping, 'is_district', True):
184
+ names.add(mapping.name)
185
+ if names:
186
+ # Merge while preserving order: existing first, then new ones not present
187
+ merged = list(self.district_whitelist)
188
+ for n in sorted(names):
189
+ if n not in merged:
190
+ merged.append(n)
191
+ self.district_whitelist = merged
192
+ self.logger.info(f"🧭 District whitelist enriched: {len(self.district_whitelist)} entries")
193
+ except Exception as e:
194
+ self.logger.info(f"ℹ️ Could not enrich districts from add_district_metadata: {e}")
195
+
196
+ # Get dynamic year list from filter_options.json
197
+ try:
198
+ fo = Path("filter_options.json")
199
+ if fo.exists():
200
+ with open(fo) as f:
201
+ data = json.load(f)
202
+ if isinstance(data, dict) and data.get("years"):
203
+ self.year_whitelist = [str(y).strip() for y in data["years"] if y]
204
+ else:
205
+ self.year_whitelist = self.available_metadata['years']
206
+ else:
207
+ self.year_whitelist = self.available_metadata['years']
208
+ except Exception:
209
+ self.year_whitelist = self.available_metadata['years']
210
+
211
+ # Calculate current year dynamically
212
+ from datetime import datetime
213
+ self.current_year = str(datetime.now().year)
214
+ self.previous_year = str(datetime.now().year - 1)
215
+
216
+ # Data context for system prompt
217
+ self.data_context = self._load_data_context()
218
+
219
+ # Build the LangGraph
220
+ self.graph = self._build_graph()
221
+
222
+ # Conversation logging
223
+ self.conversations_dir = Path("conversations")
224
+ self.conversations_dir.mkdir(exist_ok=True)
225
+
226
+ def _load_data_context(self) -> str:
227
+ """Load and analyze data context for system prompt"""
228
+ try:
229
+ # Try to load from generated context file
230
+ context_file = Path("data_context.md")
231
+ if context_file.exists():
232
+ with open(context_file) as f:
233
+ return f.read()
234
+
235
+ # Fallback to basic analysis
236
+ reports_dir = Path("reports")
237
+ testset_dir = Path("outputs/datasets/testset")
238
+
239
+ context_parts = []
240
+
241
+ # Report analysis
242
+ if reports_dir.exists():
243
+ report_folders = [d for d in reports_dir.iterdir() if d.is_dir()]
244
+ context_parts.append(f"📊 Available Reports: {len(report_folders)} audit report folders")
245
+
246
+ # Get year range
247
+ years = []
248
+ for folder in report_folders:
249
+ if "2018" in folder.name:
250
+ years.append("2018")
251
+ elif "2019" in folder.name:
252
+ years.append("2019")
253
+ elif "2020" in folder.name:
254
+ years.append("2020")
255
+ elif "2021" in folder.name:
256
+ years.append("2021")
257
+ elif "2022" in folder.name:
258
+ years.append("2022")
259
+ elif "2023" in folder.name:
260
+ years.append("2023")
261
+
262
+ if years:
263
+ context_parts.append(f"📅 Years covered: {', '.join(sorted(set(years)))}")
264
+
265
+ # Test dataset analysis
266
+ if testset_dir.exists():
267
+ test_files = list(testset_dir.glob("*.json"))
268
+ context_parts.append(f"🧪 Test dataset: {len(test_files)} files with sample questions")
269
+
270
+ return "\n".join(context_parts) if context_parts else "📊 Audit report database with comprehensive coverage"
271
+
272
+ except Exception as e:
273
+ self.logger.warning(f"⚠️ Could not load data context: {e}")
274
+ return "📊 Comprehensive audit report database"
275
+
276
+ def _build_graph(self) -> StateGraph:
277
+ """Build the LangGraph for intelligent conversation flow"""
278
+
279
+ # Define the graph
280
+ workflow = StateGraph(ConversationState)
281
+
282
+ # Add nodes
283
+ workflow.add_node("analyze_query", self._analyze_query)
284
+ workflow.add_node("decide_action", self._decide_action)
285
+ workflow.add_node("perform_rag", self._perform_rag)
286
+ workflow.add_node("ask_follow_up", self._ask_follow_up)
287
+ workflow.add_node("generate_response", self._generate_response)
288
+
289
+ # Add edges
290
+ workflow.add_edge("analyze_query", "decide_action")
291
+
292
+ # Conditional edges from decide_action
293
+ workflow.add_conditional_edges(
294
+ "decide_action",
295
+ self._should_perform_rag,
296
+ {
297
+ "rag": "perform_rag",
298
+ "follow_up": "ask_follow_up"
299
+ }
300
+ )
301
+
302
+ # From perform_rag, go to generate_response
303
+ workflow.add_edge("perform_rag", "generate_response")
304
+
305
+ # From ask_follow_up, end
306
+ workflow.add_edge("ask_follow_up", END)
307
+
308
+ # From generate_response, end
309
+ workflow.add_edge("generate_response", END)
310
+
311
+ # Set entry point
312
+ workflow.set_entry_point("analyze_query")
313
+
314
+ return workflow.compile()
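+ # Resulting flow: analyze_query -> decide_action -> perform_rag -> generate_response -> END,
+ # or analyze_query -> decide_action -> ask_follow_up -> END when clarification is needed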
315
+
316
+ def _extract_districts_list(self, text: str) -> List[str]:
317
+ """Extract one or more districts from free text using whitelist matching.
318
+ - Case-insensitive substring match for each known district name
319
+ - Handles multi-district inputs like "Lwengo Kiboga District & Namutumba"
320
+ """
321
+ if not text:
322
+ return []
323
+ q = text.lower()
324
+ found: List[str] = []
325
+ for name in self.district_whitelist:
326
+ n = name.lower()
327
+ if n in q:
328
+ # Map Kampala -> KCCA canonical
329
+ canonical = 'KCCA' if name.lower() == 'kampala' else name
330
+ if canonical not in found:
331
+ found.append(canonical)
332
+ return found
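+ # e.g. _extract_districts_list("Lwengo Kiboga District & Namutumba") should yield
+ # the three names Lwengo, Kiboga and Namutumba (in whitelist order), and any
+ # "Kampala" mention comes back as its canonical "KCCA", assuming these names
+ # are present in the loaded whitelist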
333
+
334
+ def _extract_years_list(self, text: str) -> List[str]:
335
+ """Extract year list from text, supporting forms like '2022 / 23', '2022-2023', '2022–23'."""
336
+ if not text:
337
+ return []
338
+ years: List[str] = []
339
+ q = text
340
+ # Full 4-digit years
341
+ for y in re.findall(r"\b(20\d{2})\b", q):
342
+ if y not in years:
343
+ years.append(y)
344
+ # Shorthand like 2022/23 or 2022-23
345
+ for m in re.finditer(r"\b(20\d{2})\s*[\-/–]\s*(\d{2})\b", q):
346
+ y1 = m.group(1)
347
+ y2_short = int(m.group(2))
348
+ y2 = f"20{y2_short:02d}"
349
+ for y in [y1, y2]:
350
+ if y not in years:
351
+ years.append(y)
352
+ return years
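+ # e.g. _extract_years_list("PDM releases for FY 2022/23") -> ["2022", "2023"],
+ # and _extract_years_list("compare 2021 and 2023") -> ["2021", "2023"]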
353
+
354
+ def _analyze_query(self, state: ConversationState) -> ConversationState:
355
+ """Analyze the user query with conversation context"""
356
+
357
+ query = state["current_query"]
358
+ conversation_context = state.get("conversation_context", {})
359
+
360
+ self.logger.info(f"🧠 QUERY ANALYSIS: Starting analysis for: '{query[:50]}...'")
361
+
362
+ # Build conversation context for analysis
363
+ context_info = ""
364
+ if conversation_context:
365
+ context_info = f"\n\nConversation context:\n"
366
+ for key, value in conversation_context.items():
367
+ if value:
368
+ context_info += f"- {key}: {value}\n"
369
+
370
+ # Also include recent conversation messages for better context
371
+ recent_messages = state.get("messages", [])
372
+ if recent_messages and len(recent_messages) > 1:
373
+ context_info += f"\nRecent conversation:\n"
374
+ # Get last 3 messages for context
375
+ for msg in recent_messages[-3:]:
376
+ if hasattr(msg, 'content'):
377
+ role = "User" if isinstance(msg, HumanMessage) else "Assistant"
378
+ context_info += f"- {role}: {msg.content[:100]}...\n"
379
+
380
+ # Create analysis prompt with data context
381
+ analysis_prompt = ChatPromptTemplate.from_messages([
382
+ SystemMessage(content=f"""You are an expert at analyzing audit report queries. Your job is to extract specific information and determine if a query can be answered directly.
383
+
384
+ {self.data_context}
385
+
386
+ DISTRICT RECOGNITION RULES:
387
+ - Kampala = KCCA (Kampala Capital City Authority)
388
+ - Available districts: {', '.join(self.district_whitelist[:15])}... (and {len(self.district_whitelist)-15} more)
389
+ - DLG = District Local Government
390
+ - The whitelist above covers {len(self.district_whitelist)} districts - recognize common ones
391
+
392
+ SOURCE RECOGNITION RULES:
393
+ - KCCA = Kampala Capital City Authority
394
+ - MAAIF = Ministry of Agriculture, Animal Industry and Fisheries
395
+ - MWTS = Ministry of Works and Transport
396
+ - OAG = Office of the Auditor General
397
+ - Consolidated = Annual Consolidated reports
398
+
399
+ YEAR RECOGNITION RULES:
400
+ - Available years: {', '.join(self.year_whitelist)}
401
+ - Current year is {self.current_year} - use this to reason about relative years
402
+ - If user mentions "last year", "previous year" - infer {self.previous_year}
403
+ - If user mentions "this year", "current year" - infer {self.current_year}
404
+
405
+ Analysis rules:
406
+ 1. Be SMART - if you have enough context to search, do it
407
+ 2. Use conversation context to fill in missing information
408
+ 3. For budget/expenditure queries, try to infer missing details from context
409
+ 4. Current year is {self.current_year} - use this to reason about relative years
410
+ 5. If user mentions "last year", "previous year" - infer {self.previous_year}
411
+ 6. If user mentions "this year", "current year" - infer {self.current_year}
412
+ 7. If user mentions a department/ministry, infer the source
413
+ 8. If user is getting frustrated or asking for results, proceed with RAG even if not perfect
414
+ 9. Recognize Kampala as a district (KCCA)
415
+
416
+ IMPORTANT: You must respond with ONLY valid JSON. No additional text.
417
+
418
+ Return your analysis as JSON with these exact fields:
419
+ {{
420
+ "has_district": boolean,
421
+ "has_source": boolean,
422
+ "has_year": boolean,
423
+ "extracted_district": "string or null",
424
+ "extracted_source": "string or null",
425
+ "extracted_year": "string or null",
426
+ "confidence_score": 0.0-1.0,
427
+ "can_answer_directly": boolean,
428
+ "missing_filters": ["list", "of", "missing", "filters"],
429
+ "suggested_follow_up": "string or null",
430
+ "expanded_query": "string or null"
431
+ }}
432
+
433
+ The expanded_query should be a natural language query that combines the original question with any inferred context for better RAG retrieval."""),
434
+ HumanMessage(content=f"Analyze this query: '{query}'{context_info}")
435
+ ])
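+ # Illustrative reply for "How much did Gulu DLG spend on PDM in 2023?":
+ # {"has_district": true, "has_source": true, "has_year": true,
+ #  "extracted_district": "Gulu", "extracted_source": "Gulu DLG", "extracted_year": "2023",
+ #  "confidence_score": 0.9, "can_answer_directly": true, "missing_filters": [],
+ #  "suggested_follow_up": null,
+ #  "expanded_query": "PDM expenditure for Gulu district from Gulu DLG in 2023"}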
436
+
437
+ # Get analysis from LLM
438
+ response = self.llm.invoke(analysis_prompt.format_messages())
439
+
440
+ try:
441
+ # Clean the response content to extract JSON
442
+ content = response.content.strip()
443
+
444
+ # Try to find JSON in the response
445
+ if content.startswith('{') and content.endswith('}'):
446
+ json_content = content
447
+ else:
448
+ # Try to extract JSON from the response
449
+ import re
450
+ json_match = re.search(r'\{.*\}', content, re.DOTALL)
451
+ if json_match:
452
+ json_content = json_match.group()
453
+ else:
454
+ raise json.JSONDecodeError("No JSON found in response", content, 0)
455
+
456
+ # Parse JSON response
457
+ analysis_data = json.loads(json_content)
458
+
459
+ query_analysis = QueryAnalysis(
460
+ has_district=analysis_data.get("has_district", False),
461
+ has_source=analysis_data.get("has_source", False),
462
+ has_year=analysis_data.get("has_year", False),
463
+ extracted_district=analysis_data.get("extracted_district"),
464
+ extracted_source=analysis_data.get("extracted_source"),
465
+ extracted_year=analysis_data.get("extracted_year"),
466
+ confidence_score=analysis_data.get("confidence_score", 0.0),
467
+ can_answer_directly=analysis_data.get("can_answer_directly", False),
468
+ missing_filters=analysis_data.get("missing_filters", []),
469
+ suggested_follow_up=analysis_data.get("suggested_follow_up"),
470
+ expanded_query=analysis_data.get("expanded_query")
471
+ )
472
+
473
+ except (json.JSONDecodeError, KeyError, AttributeError) as e:
474
+ self.logger.info(f"⚠️ JSON parsing failed: {e}")
475
+ # Fallback analysis - be more permissive
476
+ query_lower = query.lower()
477
+
478
+ # Simple keyword matching - improved district recognition
479
+ has_district = any(district.lower() in query_lower for district in [
480
+ 'gulu', 'kalangala', 'kampala', 'namutumba', 'lwengo', 'kiboga', 'kcca', 'maaif', 'mwts'
481
+ ])
482
+
483
+ # Special case: Kampala = KCCA
484
+ if 'kampala' in query_lower and not has_district:
485
+ has_district = True
486
+
487
+ has_source = any(source.lower() in query_lower for source in [
488
+ 'kcca', 'maaif', 'mwts', 'gulu', 'kalangala', 'consolidated', 'oag', 'government'
489
+ ])
490
+
491
+ # Check for year mentions using dynamic year list
492
+ has_year = any(year in query_lower for year in self.year_whitelist)
493
+
494
+ # Also check for explicit relative year terms
495
+ has_year = has_year or any(term in query_lower for term in [
496
+ 'this year', 'last year', 'previous year', 'current year'
497
+ ])
498
+
499
+ # Extract specific values
500
+ extracted_district = None
501
+ extracted_source = None
502
+ extracted_year = None
503
+
504
+ # Extract districts using comprehensive whitelist
505
+ for district_name in self.district_whitelist:
506
+ if district_name.lower() in query_lower:
507
+ extracted_district = district_name
508
+ break
509
+
510
+ # Also check common aliases
511
+ district_aliases = {
512
+ 'kampala': 'Kampala',
513
+ 'kcca': 'Kampala',
514
+ 'gulu': 'Gulu',
515
+ 'kalangala': 'Kalangala'
516
+ }
517
+ for alias, full_name in district_aliases.items():
518
+ if alias in query_lower and not extracted_district:
519
+ extracted_district = full_name
520
+ break
521
+
522
+ for source in ['kcca', 'maaif', 'mwts', 'consolidated', 'oag']:
523
+ if source in query_lower:
524
+ extracted_source = source.upper()
525
+ break
526
+
527
+ # Extract year using dynamic year list
528
+ for year in self.year_whitelist:
529
+ if year in query_lower:
530
+ extracted_year = year
531
+ has_year = True
532
+ break
533
+
534
+ # Only handle relative year terms if explicitly mentioned
535
+ if not extracted_year:
536
+ if 'last year' in query_lower or 'previous year' in query_lower:
537
+ extracted_year = self.previous_year
538
+ has_year = True
539
+ elif 'this year' in query_lower or 'current year' in query_lower:
540
+ extracted_year = self.current_year
541
+ has_year = True
542
+ elif 'recent' in query_lower and 'year' in query_lower:
543
+ # Use the most recent year from available data
544
+ extracted_year = max(self.year_whitelist) if self.year_whitelist else self.previous_year
545
+ has_year = True
546
+
547
+ # Be more permissive - if we have some context, try to answer
548
+ missing_filters = []
549
+ if not has_district:
550
+ missing_filters.append("district")
551
+ if not has_source:
552
+ missing_filters.append("source")
553
+ if not has_year:
554
+ missing_filters.append("year")
555
+
556
+ # If user seems frustrated or asking for results, be more permissive
557
+ frustration_indicators = ['already', 'just said', 'specified', 'provided', 'crazy', 'answer']
558
+ is_frustrated = any(indicator in query_lower for indicator in frustration_indicators)
559
+
560
+ can_answer_directly = len(missing_filters) <= 1 or is_frustrated # More permissive
561
+ confidence_score = 0.8 if can_answer_directly else 0.3
562
+
563
+ # Generate follow-up suggestion
564
+ if missing_filters and not is_frustrated:
565
+ if "district" in missing_filters and "source" in missing_filters:
566
+ suggested_follow_up = "I'd be happy to help you with that information! Could you please specify which district and department/ministry you're asking about?"
567
+ elif "district" in missing_filters:
568
+ suggested_follow_up = "Thanks for your question! Could you please specify which district you're asking about?"
569
+ elif "source" in missing_filters:
570
+ suggested_follow_up = "I can help you with that! Could you please specify which department or ministry you're asking about?"
571
+ elif "year" in missing_filters:
572
+ suggested_follow_up = "Great question! Could you please specify which year you're interested in?"
573
+ else:
574
+ suggested_follow_up = "Could you please provide more specific details to help me give you a precise answer?"
575
+ else:
576
+ suggested_follow_up = None
577
+
578
+ # Create expanded query
579
+ expanded_query = query
580
+ if extracted_district:
581
+ expanded_query += f" for {extracted_district} district"
582
+ if extracted_source:
583
+ expanded_query += f" from {extracted_source}"
584
+ if extracted_year:
585
+ expanded_query += f" in {extracted_year}"
586
+
587
+ query_analysis = QueryAnalysis(
588
+ has_district=has_district,
589
+ has_source=has_source,
590
+ has_year=has_year,
591
+ extracted_district=extracted_district,
592
+ extracted_source=extracted_source,
593
+ extracted_year=extracted_year,
594
+ confidence_score=confidence_score,
595
+ can_answer_directly=can_answer_directly,
596
+ missing_filters=missing_filters,
597
+ suggested_follow_up=suggested_follow_up,
598
+ expanded_query=expanded_query
599
+ )
600
+
601
+ # Update conversation context
602
+ if query_analysis.extracted_district:
603
+ conversation_context["district"] = query_analysis.extracted_district
604
+ if query_analysis.extracted_source:
605
+ conversation_context["source"] = query_analysis.extracted_source
606
+ if query_analysis.extracted_year:
607
+ conversation_context["year"] = query_analysis.extracted_year
608
+
609
+ state["query_analysis"] = query_analysis
610
+ state["conversation_context"] = conversation_context
611
+
612
+ self.logger.info(f"✅ ANALYSIS COMPLETE: district={query_analysis.has_district}, source={query_analysis.has_source}, year={query_analysis.has_year}")
613
+ self.logger.info(f"📈 Confidence: {query_analysis.confidence_score:.2f}, Can answer directly: {query_analysis.can_answer_directly}")
614
+ if query_analysis.expanded_query:
615
+ self.logger.info(f"🔄 Expanded query: {query_analysis.expanded_query}")
616
+
617
+ return state
618
+
619
+ def _decide_action(self, state: ConversationState) -> ConversationState:
620
+ """Decide what action to take based on query analysis"""
621
+
622
+ analysis = state["query_analysis"]
623
+
624
+ # Add decision reasoning
625
+ if analysis.can_answer_directly and analysis.confidence_score > 0.7:
626
+ self.logger.info(f"🚀 DECISION: Query is complete, proceeding with RAG")
627
+ self.logger.info(f"📊 REASONING: Confidence={analysis.confidence_score:.2f}, Missing filters={len(analysis.missing_filters or [])}")
628
+ if analysis.missing_filters:
629
+ self.logger.info(f"📋 Missing: {', '.join(analysis.missing_filters)}")
630
+ else:
631
+ self.logger.info(f"✅ All required information available")
632
+ else:
633
+ self.logger.info(f"❓ DECISION: Query incomplete, asking follow-up")
634
+ self.logger.info(f"📊 REASONING: Confidence={analysis.confidence_score:.2f}, Missing filters={len(analysis.missing_filters or [])}")
635
+ if analysis.missing_filters:
636
+ self.logger.info(f"📋 Missing: {', '.join(analysis.missing_filters)}")
637
+ self.logger.info(f"💡 Follow-up needed: {analysis.suggested_follow_up}")
638
+
639
+ return state
640
+
641
+ def _should_perform_rag(self, state: ConversationState) -> str:
642
+ """Determine whether to perform RAG or ask follow-up"""
643
+
644
+ analysis = state["query_analysis"]
645
+ conversation_context = state.get("conversation_context", {})
646
+ recent_messages = state.get("messages", [])
647
+
648
+ # Check if we have enough context from conversation history
649
+ has_district_context = analysis.has_district or conversation_context.get("district")
650
+ has_source_context = analysis.has_source or conversation_context.get("source")
651
+ has_year_context = analysis.has_year or conversation_context.get("year")
652
+
653
+ # Count how many context pieces we have
654
+ context_count = sum([bool(has_district_context), bool(has_source_context), bool(has_year_context)])
655
+
656
+ # For PDM queries, we need more specific information
657
+ current_query = state["current_query"].lower()
658
+ recent_messages = state.get("messages", [])
659
+
660
+ # Check if this is a PDM query by looking at current query OR recent conversation
661
+ is_pdm_query = "pdm" in current_query or "parish development" in current_query
662
+
663
+ # Also check recent messages for PDM context
664
+ if not is_pdm_query and recent_messages:
665
+ for msg in recent_messages[-3:]: # Check last 3 messages
666
+ if isinstance(msg, HumanMessage) and ("pdm" in msg.content.lower() or "parish development" in msg.content.lower()):
667
+ is_pdm_query = True
668
+ break
669
+
670
+ if is_pdm_query:
671
+ # For PDM queries, we need district AND year to be specific enough
672
+ # But we need them to be explicitly provided in the current conversation, not just inferred
673
+ if has_district_context and has_year_context:
674
+ # Check if both district and year are explicitly mentioned in recent messages
675
+ explicit_district = False
676
+ explicit_year = False
677
+
678
+ for msg in recent_messages[-3:]: # Check last 3 messages
679
+ if isinstance(msg, HumanMessage):
680
+ content = msg.content.lower()
681
+ if any(district in content for district in ["gulu", "kalangala", "kampala", "namutumba"]):
682
+ explicit_district = True
683
+ if any(year in content for year in ["2022", "2023", "2022/23", "2023/24"]):
684
+ explicit_year = True
685
+
686
+ if explicit_district and explicit_year:
687
+ self.logger.info(f"🚀 DECISION: PDM query with explicit district and year, proceeding with RAG")
688
+ self.logger.info(f"📊 REASONING: PDM query - explicit_district={explicit_district}, explicit_year={explicit_year}")
689
+ return "rag"
690
+ else:
691
+ self.logger.info(f"❓ DECISION: PDM query needs explicit district and year, asking follow-up")
692
+ self.logger.info(f"📊 REASONING: PDM query - explicit_district={explicit_district}, explicit_year={explicit_year}")
693
+ return "follow_up"
694
+ else:
695
+ self.logger.info(f"❓ DECISION: PDM query needs more specific info, asking follow-up")
696
+ self.logger.info(f"📊 REASONING: PDM query - district={has_district_context}, year={has_year_context}")
697
+ return "follow_up"
698
+
699
+ # For general queries, be more conservative - need at least 2 pieces AND high confidence
700
+ if context_count >= 2 and analysis.confidence_score > 0.8:
701
+ self.logger.info(f"🚀 DECISION: Sufficient context with high confidence, proceeding with RAG")
702
+ self.logger.info(f"📊 REASONING: Context pieces: district={has_district_context}, source={has_source_context}, year={has_year_context}, confidence={analysis.confidence_score}")
703
+ return "rag"
704
+
705
+ # If user seems frustrated (short responses like "no"), proceed with RAG
706
+ if recent_messages and len(recent_messages) >= 3: # Need more messages to detect frustration
707
+ last_user_message = None
708
+ for msg in reversed(recent_messages):
709
+ if isinstance(msg, HumanMessage):
710
+ last_user_message = msg.content.lower().strip()
711
+ break
712
+
713
+ if last_user_message and len(last_user_message) < 10 and any(word in last_user_message for word in ["no", "yes", "ok", "sure"]):
714
+ self.logger.info(f"🚀 DECISION: User seems frustrated with short response, proceeding with RAG")
715
+ return "rag"
716
+
717
+ # Original logic for direct answers
718
+ if analysis.can_answer_directly and analysis.confidence_score > 0.7:
719
+ return "rag"
720
+ else:
721
+ return "follow_up"
722
+
723
+ def _ask_follow_up(self, state: ConversationState) -> ConversationState:
724
+ """Generate a follow-up question to clarify missing information"""
725
+
726
+ analysis = state["query_analysis"]
727
+ current_query = state["current_query"].lower()
728
+ conversation_context = state.get("conversation_context", {})
729
+
730
+ # Check if this is a PDM query
731
+ is_pdm_query = "pdm" in current_query or "parish development" in current_query
732
+
733
+ if is_pdm_query:
734
+ # Generate PDM-specific follow-up questions
735
+ missing_info = []
736
+
737
+ if not analysis.has_district and not conversation_context.get("district"):
738
+ missing_info.append("district (e.g., Gulu, Kalangala)")
739
+
740
+ if not analysis.has_year and not conversation_context.get("year"):
741
+ missing_info.append("year (e.g., 2022, 2023)")
742
+
743
+ if missing_info:
744
+ follow_up_message = f"For PDM administrative costs information, I need to know the {', '.join(missing_info)}. Could you please specify these details?"
745
+ else:
746
+ follow_up_message = "Could you please provide more specific details about the PDM administrative costs you're looking for?"
747
+ else:
748
+ # Use the original follow-up logic
749
+ if analysis.suggested_follow_up:
750
+ follow_up_message = analysis.suggested_follow_up
751
+ else:
752
+ follow_up_message = "Could you please provide more specific details to help me give you a precise answer?"
753
+
754
+ state["final_response"] = follow_up_message
755
+ state["last_ai_message_time"] = time.time()
756
+
757
+ return state
758
+
759
+ def _build_comprehensive_query(self, current_query: str, analysis, conversation_context: dict, recent_messages: list) -> str:
760
+ """Build a better RAG query from conversation.
761
+ - If latest message is a short modifier (e.g., "financial"), merge it into the last substantive question.
762
+ - If latest message looks like filters (district/year), keep the last question unchanged.
763
+ - Otherwise, use the current message.
764
+ """
765
+
766
+ def is_interrogative(text: str) -> bool:
767
+ t = text.lower().strip()
768
+ return any(t.startswith(w) for w in ["what", "how", "why", "when", "where", "which", "who"]) or t.endswith("?")
769
+
770
+ def is_filter_like(text: str) -> bool:
771
+ t = text.lower()
772
+ if "district" in t:
773
+ return True
774
+ if re.search(r"\b20\d{2}\b", t) or re.search(r"20\d{2}\s*[\-/–]\s*\d{2}\b", t):
775
+ return True
776
+ if self._extract_districts_list(text):
777
+ return True
778
+ return False
779
+
780
+ # Find last substantive user question
781
+ last_question = None
782
+ for msg in reversed(recent_messages[:-1] if recent_messages else []):
783
+ if isinstance(msg, HumanMessage):
784
+ if is_interrogative(msg.content) and len(msg.content.strip()) > 15:
785
+ last_question = msg.content.strip()
786
+ break
787
+
788
+ cq = current_query.strip()
789
+ words = cq.split()
790
+ is_short_modifier = (not is_interrogative(cq)) and (len(words) <= 3)
791
+
792
+ if is_filter_like(cq) and last_question:
793
+ comprehensive_query = last_question
794
+ elif is_short_modifier and last_question:
795
+ modifier = cq
796
+ if modifier.lower() in last_question.lower():
797
+ comprehensive_query = last_question
798
+ else:
799
+ if last_question.endswith('?'):
800
+ comprehensive_query = last_question[:-1] + f" for {modifier}?"
801
+ else:
802
+ comprehensive_query = last_question + f" for {modifier}"
803
+ else:
804
+ comprehensive_query = current_query
805
+
806
+ self.logger.info(f"🔄 COMPREHENSIVE QUERY: '{comprehensive_query}'")
807
+ return comprehensive_query
808
+
809
+ def _rewrite_query_with_llm(self, recent_messages: list, draft_query: str) -> str:
810
+ """Use the LLM to rewrite a clean, focused RAG query from the conversation.
811
+ Rules enforced in prompt:
812
+ - Keep the user's main information need from the last substantive question
813
+ - Integrate short modifiers (e.g., "financial") into that question when appropriate
814
+ - Do NOT include filter text (years/districts/sources) in the query; those are handled separately
815
+ - Return a single plain sentence only (no quotes, no markdown)
816
+ """
817
+ try:
818
+ # Build a compact conversation transcript (last 6 messages max)
819
+ convo_lines = []
820
+ for msg in recent_messages[-6:]:
821
+ if isinstance(msg, HumanMessage):
822
+ convo_lines.append(f"User: {msg.content}")
823
+ elif isinstance(msg, AIMessage):
824
+ convo_lines.append(f"Assistant: {msg.content}")
825
+
826
+ convo_text = "\n".join(convo_lines)
827
+
828
+ """
829
+ "DECISION GUIDANCE:\n"
830
+ "- If the latest user message looks like a modifier (e.g., 'financial'), merge it into the best prior question.\n"
831
+ "- If the latest message provides filters (e.g., districts, years), DO NOT embed them; keep the base question.\n"
832
+ "- If the latest message itself is a full, clear question, use it.\n"
833
+ "- If the draft query is already good, you may refine its clarity but keep the same intent.\n\n"
834
+ """
835
+
836
+
837
+ prompt = ChatPromptTemplate.from_messages([
838
+ SystemMessage(content=(
839
+ "ROLE: Query Rewriter for a RAG system.\n\n"
840
+ "PRIMARY OBJECTIVE:\n- Produce ONE retrieval-focused sentence that best represents the user's information need.\n"
841
+ "- Maximize recall of relevant evidence; be specific but not overconstrained.\n\n"
842
+ "INPUTS:\n- Conversation with User and Assistant turns (latest last).\n- A draft query (heuristic).\n\n"
843
+ "OPERATING PRINCIPLES:\n"
844
+ "1) Use the last substantive USER question as the backbone of intent.\n"
845
+ "2) Merge helpful domain modifiers from any USER turns (financial, procurement, risk) when they sharpen focus; ignore if not helpful.\n"
846
+ "3) Treat Assistant messages as guidance only; if the user later provided filters (years, districts, sources), DO NOT embed them in the query (filters are applied separately).\n"
847
+ "4) Remove meta-verbs like 'summarize', 'list', 'explain', 'compare' from the query.\n"
848
+ "5) Prefer content-bearing terms (topics, programs, outcomes) over task phrasing.\n"
849
+ "6) If the latest user message is filters-only, keep the prior substantive question unchanged.\n"
850
+ "7) If the draft query is already strong, refine wording for clarity but keep the same intent.\n\n"
851
+ "EXAMPLES (multi-turn):\n"
852
+ "A)\nUser: What are the top 5 priorities for improving audit procedures?\nAssistant: Could you specify the scope (e.g., financial, procurement)?\nUser: Financial\n→ Output: Top priorities for improving financial audit procedures.\n\n"
853
+ "B)\nUser: How were PDM administrative costs utilized and what was the impact of shortfalls?\nAssistant: Please specify district/year for precision.\nUser: Namutumba and Lwengo Districts (2022/23)\n→ Output: How were PDM administrative costs utilized and what was the impact of shortfalls.\n(Exclude districts/years; they are filters.)\n\n"
854
+ "C)\nUser: Summarize risk management issues in audit reports.\n→ Output: Key risk management issues in audit reports.\n\n"
855
+ "CONSTRAINTS:\n- Do NOT include filters (years, districts, sources, filenames).\n- Do NOT include quotes/markdown/bullets or multiple sentences.\n- Return exactly one plain sentence."
856
+ )),
857
+ HumanMessage(content=(
858
+ f"Conversation (most recent last):\n{convo_text}\n\n"
859
+ f"Draft query: {draft_query}\n\n"
860
+ "Rewrite the single best retrieval query sentence now:"
861
+ )),
862
+ ])
863
+
864
+ # Add timeout for LLM call
865
+ import signal
866
+
867
+ def timeout_handler(signum, frame):
868
+ raise TimeoutError("LLM rewrite timeout")
869
+
870
+ # Set a 10-second timeout (SIGALRM works only on Unix and in the main thread; if it cannot be installed, the outer except falls back to the draft query)
871
+ signal.signal(signal.SIGALRM, timeout_handler)
872
+ signal.alarm(10)
873
+
874
+ try:
875
+ resp = self.llm.invoke(prompt.format_messages())
876
+ signal.alarm(0) # Cancel timeout
877
+
878
+ rewritten = getattr(resp, 'content', '').strip()
879
+ # Basic sanitization: keep it one line
880
+ rewritten = rewritten.replace('\n', ' ').strip()
881
+ if rewritten and len(rewritten) > 5: # Basic quality check
882
+ self.logger.info(f"🛠️ LLM REWRITER: '{rewritten}'")
883
+ return rewritten
884
+ else:
885
+ self.logger.info(f"⚠️ LLM rewrite too short/empty, using draft query")
886
+ return draft_query
887
+ except TimeoutError:
888
+ signal.alarm(0)
889
+ self.logger.info(f"⚠️ LLM rewrite timeout after 10s, using draft query")
890
+ return draft_query
891
+ except Exception as e:
892
+ signal.alarm(0)
893
+ self.logger.info(f"⚠️ LLM rewrite failed, using draft query. Error: {e}")
894
+ return draft_query
895
+ except Exception as e:
896
+ self.logger.info(f"⚠️ LLM rewrite setup failed, using draft query. Error: {e}")
897
+ return draft_query
898
+
899
+ def _perform_rag(self, state: ConversationState) -> ConversationState:
900
+ """Perform RAG retrieval with smart query expansion"""
901
+
902
+ query = state["current_query"]
903
+ analysis = state["query_analysis"]
904
+ conversation_context = state.get("conversation_context", {})
905
+ recent_messages = state.get("messages", [])
906
+
907
+ # Build comprehensive query from conversation history
908
+ draft_query = self._build_comprehensive_query(query, analysis, conversation_context, recent_messages)
909
+ # Let LLM rewrite a clean, focused search query
910
+ search_query = self._rewrite_query_with_llm(recent_messages, draft_query)
911
+
912
+ self.logger.info(f"🔍 RAG RETRIEVAL: Starting for query: '{search_query[:50]}...'")
913
+ self.logger.info(f"📊 Analysis: district={analysis.has_district}, source={analysis.has_source}, year={analysis.has_year}")
914
+
915
+ try:
916
+ # Build filters from analysis and conversation context
917
+ filters = {}
918
+
919
+ # Use conversation context to fill in missing filters
920
+ source = analysis.extracted_source or conversation_context.get("source")
921
+ district = analysis.extracted_district or conversation_context.get("district")
922
+ year = analysis.extracted_year or conversation_context.get("year")
923
+
924
+ if source:
925
+ filters["source"] = [source] # Qdrant expects lists
926
+ self.logger.info(f"🎯 Filter: source={source}")
927
+
928
+ if year:
929
+ filters["year"] = [year]
930
+ self.logger.info(f"🎯 Filter: year={year}")
931
+
932
+ if district:
933
+ # Map district to source if needed
934
+ if district.upper() == "KAMPALA":
935
+ filters["source"] = ["KCCA"]
936
+ self.logger.info(f"🎯 Filter: district={district} -> source=KCCA")
937
+ elif district.upper() in ["GULU", "KALANGALA"]:
938
+ filters["source"] = [f"{district.upper()} DLG"]
939
+ self.logger.info(f"🎯 Filter: district={district} -> source={district.upper()} DLG")
940
+
941
+ # Run RAG pipeline with correct parameters
942
+ result = self.pipeline_manager.run(
943
+ query=search_query, # Use expanded query
944
+ sources=filters.get("source") if filters.get("source") else None,
945
+ auto_infer_filters=False, # Our agent already handled filter inference
946
+ filters=filters if filters else None
947
+ )
948
+
949
+ self.logger.info(f"✅ RAG completed: Found {len(result.sources)} sources")
950
+ self.logger.info(f"⏱️ Execution time: {result.execution_time:.2f}s")
951
+
952
+ # Store RAG result in state
953
+ state["rag_result"] = result
954
+ state["rag_query"] = search_query
955
+
956
+ except Exception as e:
957
+ self.logger.info(f"❌ RAG retrieval failed: {e}")
958
+ state["rag_result"] = None
959
+
960
+ return state
961
+
962
+ def _generate_response(self, state: ConversationState) -> ConversationState:
963
+ """Generate final response using RAG results"""
964
+
965
+ rag_result = state["rag_result"]
966
+
967
+ self.logger.info(f"📝 RESPONSE: Using RAG result ({len(rag_result.answer)} chars)")
968
+
969
+ # Store the final response directly from RAG
970
+ state["final_response"] = rag_result.answer
971
+ state["last_ai_message_time"] = time.time()
972
+
973
+ return state
974
+
975
+ def chat(self, user_input: str, conversation_id: str = "default") -> Dict[str, Any]:
976
+ """Main chat interface; returns a dict with 'response', 'rag_result' and 'actual_rag_query'"""
977
+
978
+ self.logger.info(f"💬 CHAT: Processing user input: '{user_input[:50]}...'")
979
+ self.logger.info(f"📊 Session: {conversation_id}")
980
+
981
+ # Load conversation history
982
+ conversation_file = self.conversations_dir / f"{conversation_id}.json"
983
+ conversation = self._load_conversation(conversation_file)
984
+
985
+ # Add user message to conversation
986
+ conversation["messages"].append(HumanMessage(content=user_input))
987
+
988
+ self.logger.info(f"🔄 LANGGRAPH: Starting graph execution")
989
+
990
+ # Prepare state for LangGraph with conversation context
991
+ state = ConversationState(
992
+ conversation_id=conversation_id,
993
+ messages=conversation["messages"],
994
+ current_query=user_input,
995
+ query_analysis=None,
996
+ conversation_context=conversation.get("context", {}),
997
+ rag_result=None,
998
+ final_response=None,
999
+ session_start_time=conversation["session_start_time"],
1000
+ last_ai_message_time=conversation["last_ai_message_time"]
1001
+ )
1002
+
1003
+ # Run the graph
1004
+ final_state = self.graph.invoke(state)
1005
+
1006
+ # Add the AI response to conversation
1007
+ if final_state["final_response"]:
1008
+ conversation["messages"].append(AIMessage(content=final_state["final_response"]))
1009
+
1010
+ # Update conversation state
1011
+ conversation["last_ai_message_time"] = final_state["last_ai_message_time"]
1012
+ conversation["context"] = final_state["conversation_context"]
1013
+
1014
+ # Save conversation
1015
+ self._save_conversation(conversation_file, conversation)
1016
+
1017
+ self.logger.info(f"✅ LANGGRAPH: Graph execution completed")
1018
+ self.logger.info(f"🎯 CHAT COMPLETE: Response ready")
1019
+
1020
+ # Return both response and RAG result for UI
1021
+ return {
1022
+ 'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
1023
+ 'rag_result': final_state["rag_result"],
1024
+ 'actual_rag_query': final_state.get("rag_query", "")
1025
+ }
1026
+
1027
+ def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
1028
+ """Load conversation from file"""
1029
+ if conversation_file.exists():
1030
+ try:
1031
+ with open(conversation_file) as f:
1032
+ data = json.load(f)
1033
+ # Convert message dicts back to LangChain messages
1034
+ messages = []
1035
+ for msg_data in data.get("messages", []):
1036
+ if msg_data["type"] == "human":
1037
+ messages.append(HumanMessage(content=msg_data["content"]))
1038
+ elif msg_data["type"] == "ai":
1039
+ messages.append(AIMessage(content=msg_data["content"]))
1040
+ data["messages"] = messages
1041
+ return data
1042
+ except Exception as e:
1043
+ self.logger.info(f"⚠️ Could not load conversation: {e}")
1044
+
1045
+ # Return default conversation
1046
+ return {
1047
+ "messages": [],
1048
+ "session_start_time": time.time(),
1049
+ "last_ai_message_time": time.time(),
1050
+ "context": {}
1051
+ }
1052
+
1053
+ def _save_conversation(self, conversation_file: Path, conversation: Dict[str, Any]):
1054
+ """Save conversation to file"""
1055
+ try:
1056
+ # Convert LangChain messages to serializable format
1057
+ messages_data = []
1058
+ for msg in conversation["messages"]:
1059
+ if isinstance(msg, HumanMessage):
1060
+ messages_data.append({"type": "human", "content": msg.content})
1061
+ elif isinstance(msg, AIMessage):
1062
+ messages_data.append({"type": "ai", "content": msg.content})
1063
+
1064
+ data = {
1065
+ "messages": messages_data,
1066
+ "session_start_time": conversation["session_start_time"],
1067
+ "last_ai_message_time": conversation["last_ai_message_time"],
1068
+ "context": conversation.get("context", {}),
1069
+ "last_updated": datetime.now().isoformat()
1070
+ }
1071
+
1072
+ with open(conversation_file, "w") as f:
1073
+ json.dump(data, f, indent=2)
1074
+
1075
+ except Exception as e:
1076
+ self.logger.info(f"⚠️ Could not save conversation: {e}")
1077
+
1078
+
1079
+ def get_chatbot():
1080
+ """Get chatbot instance"""
1081
+ return IntelligentRAGChatbot()
1082
+
1083
+ if __name__ == "__main__":
1084
+ # Test the chatbot
1085
+ chatbot = IntelligentRAGChatbot()
1086
+
1087
+ # Test conversation
1088
+ test_queries = [
1089
+ "How much was the budget allocation for government salary payroll management?",
1090
+ "Namutumba district in 2023",
1091
+ "KCCA"
1092
+ ]
1093
+
1094
+ for query in test_queries:
1095
+ self.logger.info(f"\n{'='*50}")
1096
+ self.logger.info(f"User: {query}")
1097
+ response = chatbot.chat(query)
1098
+ self.logger.info(f"Bot: {response}")
src/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audit QA Refactored Module
3
+ A modular and maintainable RAG pipeline for audit report analysis.
4
+ """
5
+
6
+ from .pipeline import PipelineManager
7
+ from .config.loader import load_config
8
+
9
+ __version__ = "2.0.0"
10
+ __all__ = ["PipelineManager", "load_config"]
src/config/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Configuration management for Audit QA."""
2
+
3
+ from .loader import load_config, get_nested_config
4
+
5
+ __all__ = ["load_config", "get_nested_config"]
src/config/collections.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "docling": {
3
+ "model": "BAAI/bge-m3",
4
+ "description": "Default collection with BGE-M3 embedding model"
5
+ },
6
+ "modernbert-embed-base-akryl-matryoshka": {
7
+ "model": "Akryl/modernbert-embed-base-akryl-matryoshka",
8
+ "description": "ModernBERT embedding model with matryoshka representation"
9
+ },
10
+ "sentence-transformers-all-MiniLM-L6-v2": {
11
+ "model": "sentence-transformers/all-MiniLM-L6-v2",
12
+ "description": "Sentence transformers MiniLM model"
13
+ },
14
+ "sentence-transformers-all-mpnet-base-v2": {
15
+ "model": "sentence-transformers/all-mpnet-base-v2",
16
+ "description": "Sentence transformers MPNet model"
17
+ },
18
+ "BAAI-bge-m3": {
19
+ "model": "BAAI/bge-m3",
20
+ "description": "BAAI BGE-M3 multilingual embedding model"
21
+ }
22
+ }
src/config/loader.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration loader for YAML settings."""
2
+
3
+ import yaml
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Dict, Any, Optional
7
+ from dotenv import load_dotenv
8
+ import os
9
+
10
+ load_dotenv()
11
+
12
+ def load_config(config_path: str = None) -> Dict[str, Any]:
13
+ """
14
+ Load configuration from YAML file.
15
+
16
+ Args:
17
+ config_path: Path to config file. If None, uses default settings.yaml
18
+
19
+ Returns:
20
+ Dictionary containing configuration settings
21
+ """
22
+ if config_path is None:
23
+ # Default to settings.yaml in the same directory as this file
24
+ config_path = Path(__file__).parent / "settings.yaml"
25
+
26
+ config_path = Path(config_path)
27
+
28
+ if not config_path.exists():
29
+ raise FileNotFoundError(f"Configuration file not found: {config_path}")
30
+
31
+ with open(config_path, 'r', encoding='utf-8') as f:
32
+ content = f.read()
33
+
34
+ # Replace environment variables in the content
35
+ import os
36
+ import re
37
+
38
+ def replace_env_vars(match):
39
+ env_var = match.group(1)
40
+ return os.getenv(env_var, match.group(0)) # Return original if env var not found
41
+
42
+ # Replace ${VAR} patterns with environment variables
43
+ content = re.sub(r'\$\{([^}]+)\}', replace_env_vars, content)
44
+
45
+ config = yaml.safe_load(content)
46
+
47
+ # Override with environment variables if they exist
48
+ config = _override_with_env_vars(config)
49
+
50
+ return config
51
+
52
+
53
+ def _override_with_env_vars(config: Dict[str, Any]) -> Dict[str, Any]:
54
+ """Override config values with environment variables where available."""
55
+
56
+ # Map environment variables to config paths
57
+ env_mappings = {
58
+ 'QDRANT_URL': ['qdrant', 'url'],
59
+ 'QDRANT_COLLECTION': ['qdrant', 'collection_name'],
60
+ 'QDRANT_API_KEY': ['qdrant', 'api_key'],
61
+ 'RETRIEVER_MODEL': ['retriever', 'model'],
62
+ 'RANKER_MODEL': ['ranker', 'model'],
63
+ 'READER_TYPE': ['reader', 'default_type'],
64
+ 'MAX_TOKENS': ['reader', 'max_tokens'],
65
+ 'MISTRAL_API_KEY': ['reader', 'MISTRAL', 'api_key'],
66
+ 'OPENAI_API_KEY': ['reader', 'OPENAI', 'api_key'],
67
+ 'NEBIUS_API_KEY': ['reader', 'INF_PROVIDERS', 'api_key'],
68
+ 'NVIDIA_SERVER_API_KEY': ['reader', 'NVIDIA', 'api_key'],
69
+ 'SERVERLESS_API_KEY': ['reader', 'SERVERLESS', 'api_key'],
70
+ 'DEDICATED_API_KEY': ['reader', 'DEDICATED', 'api_key'],
71
+ 'OPENROUTER_API_KEY': ['reader', 'OPENROUTER', 'api_key'],
72
+ }
73
+
74
+ for env_var, config_path in env_mappings.items():
75
+ env_value = os.getenv(env_var)
76
+ if env_value:
77
+ # Navigate to the nested config location
78
+ current = config
79
+ for key in config_path[:-1]:
80
+ if key not in current:
81
+ current[key] = {}
82
+ current = current[key]
83
+
84
+ # Set the final value, converting to appropriate type
85
+ final_key = config_path[-1]
86
+ if final_key in ['top_k', 'max_tokens', 'num_predict']:
87
+ current[final_key] = int(env_value)
88
+ elif final_key in ['normalize', 'prefer_grpc']:
89
+ current[final_key] = env_value.lower() in ('true', '1', 'yes')
90
+ elif final_key == 'temperature':
91
+ current[final_key] = float(env_value)
92
+ else:
93
+ current[final_key] = env_value
94
+
95
+ return config
96
+
97
+
98
+ def get_nested_config(config: Dict[str, Any], path: str, default=None):
99
+ """
100
+ Get a nested configuration value using dot notation.
101
+
102
+ Args:
103
+ config: Configuration dictionary
104
+ path: Dot-separated path (e.g., 'reader.MISTRAL.model')
105
+ default: Default value if path not found
106
+
107
+ Returns:
108
+ Configuration value or default
109
+ """
110
+ keys = path.split('.')
111
+ current = config
112
+
113
+ try:
114
+ for key in keys:
115
+ current = current[key]
116
+ return current
117
+ except (KeyError, TypeError):
118
+ return default
119
+
120
+
121
+ def load_collections_mapping() -> Dict[str, Dict[str, str]]:
122
+ """Load collections mapping from JSON file."""
123
+ collections_file = Path(__file__).parent / "collections.json"
124
+
125
+ if not collections_file.exists():
126
+ # Return default mapping if file doesn't exist
127
+ return {
128
+ "docling": {
129
+ "model": "sentence-transformers/all-MiniLM-L6-v2",
130
+ "description": "Default collection"
131
+ }
132
+ }
133
+
134
+ with open(collections_file, 'r') as f:
135
+ return json.load(f)
136
+
137
+
138
+ def get_embedding_model_for_collection(collection_name: str) -> Optional[str]:
139
+ """Get embedding model for a specific collection name."""
140
+ collections = load_collections_mapping()
141
+
142
+ if collection_name in collections:
143
+ return collections[collection_name]["model"]
144
+
145
+ # Try to infer from collection name patterns
146
+ if "modernbert" in collection_name.lower():
147
+ return "Akryl/modernbert-embed-base-akryl-matryoshka"
148
+ elif "minilm" in collection_name.lower():
149
+ return "sentence-transformers/all-MiniLM-L6-v2"
150
+ elif "mpnet" in collection_name.lower():
151
+ return "sentence-transformers/all-mpnet-base-v2"
152
+ elif "bge" in collection_name.lower():
153
+ return "BAAI/bge-m3"
154
+
155
+ return None
156
+
157
+
158
+ def get_collection_info(collection_name: str) -> Dict[str, str]:
159
+ """Get full collection information including model and description."""
160
+ collections = load_collections_mapping()
161
+
162
+ if collection_name in collections:
163
+ return collections[collection_name]
164
+
165
+ # Return inferred info for unknown collections
166
+ model = get_embedding_model_for_collection(collection_name)
167
+ return {
168
+ "model": model or "unknown",
169
+ "description": f"Auto-inferred collection: {collection_name}"
170
+ }
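A short usage sketch of the loaders defined above; the collection names and expected values are taken from collections.json and settings.yaml in this commit.

from src.config.loader import (
    load_config,
    get_nested_config,
    get_embedding_model_for_collection,
)

config = load_config()  # reads settings.yaml, expands ${VARS}, applies env overrides
print(get_nested_config(config, "reader.OPENAI.model", default="gpt-4o-mini"))
print(get_nested_config(config, "retriever.top_k"))           # 20 in settings.yaml
print(get_embedding_model_for_collection("docling"))          # "BAAI/bge-m3"
print(get_embedding_model_for_collection("my-minilm-index"))  # inferred: all-MiniLM-L6-v2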
src/config/settings.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audit QA Configuration
2
+ # Converted from model_params.cfg to YAML format
3
+
4
+ qdrant:
5
+ # url: "http://10.1.4.192:8803"`
6
+ url: "https://2c6d0136-b6ca-4400-bac5-1703f58abc43.europe-west3-0.gcp.cloud.qdrant.io"
7
+ collection_name: "docling"
8
+ prefer_grpc: true
9
+ api_key: "${QDRANT_API_KEY}" # Load from environment variable
10
+
11
+ retriever:
12
+ model: "BAAI/bge-m3"
13
+ normalize: true
14
+ top_k: 20
15
+
16
+ retrieval:
17
+ use_reranking: true
18
+ reranker_model: "BAAI/bge-reranker-v2-m3"
19
+ reranker_top_k: 5
20
+
21
+ ranker:
22
+ model: "BAAI/bge-reranker-v2-m3"
23
+ top_k: 5
24
+
25
+ bm25:
26
+ top_k: 20
27
+
28
+ hybrid:
29
+ default_mode: "vector_only" # Options: vector_only, sparse_only, hybrid
30
+ default_alpha: 0.5 # Weight for vector scores (0.5 = equal weight)
31
+
32
+ reader:
33
+ default_type: "OPENAI"
34
+ max_tokens: 768
35
+
36
+ # Different LLM provider configurations
37
+ INF_PROVIDERS:
38
+ model: "meta-llama/Llama-3.1-8B-Instruct"
39
+ provider: "nebius"
40
+
41
+ # Not working
42
+ NVIDIA:
43
+ model: "meta-llama/Llama-3.1-8B-Instruct"
44
+ endpoint: "https://huggingface.co/api/integrations/dgx/v1"
45
+
46
+ # Not working
47
+ DEDICATED:
48
+ model: "meta-llama/Llama-3.1-8B-Instruct"
49
+ endpoint: "https://qu2d8m6dmsollhly.us-east-1.aws.endpoints.huggingface.cloud"
50
+
51
+ MISTRAL:
52
+ model: "mistral-medium-latest"
53
+
54
+ OPENAI:
55
+ model: "gpt-4o-mini"
56
+
57
+ OLLAMA:
58
+ model: "mistral-small3.1:24b-instruct-2503-q8_0"
59
+ base_url: "http://10.1.4.192:11434/"
60
+ temperature: 0.8
61
+ num_predict: 256
62
+
63
+ OPENROUTER:
64
+ model: "moonshotai/kimi-k2:free"
65
+ base_url: "https://openrouter.ai/api/v1"
66
+ temperature: 0.7
67
+ max_tokens: 1000
68
+ # site_url: "https://your-site.com" # optional, for OpenRouter ranking
69
+ # site_name: "Your Site Name" # optional, for OpenRouter ranking
70
+
71
+ app:
72
+ dropdown_default: "Annual Consolidated OAG 2024"
73
+
74
+ # File paths
75
+ paths:
76
+ chunks_file: "reports/docling_chunks.json"
77
+ reports_dir: "reports"
78
+
79
+ # Feature toggles
80
+ features:
81
+ enable_session: true
82
+ enable_logging: true
83
+
84
+ # Logging and HuggingFace scheduler configuration
85
+ logging:
86
+ json_dataset_dir: "json_dataset"
87
+ huggingface:
88
+ repo_id: "GIZ/spaces_logs"
89
+ repo_type: "dataset"
90
+ folder_path: "json_dataset"
91
+ path_in_repo: "audit_chatbot"
92
+ token_env_var: "SPACES_LOG"
src/llm/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """LLM adapters and utilities."""
2
+
3
+ from .adapters import LLMRegistry, get_llm_client
4
+ from .templates import get_message_template, PromptTemplate, create_audit_prompt
5
+
6
+ __all__ = ["LLMRegistry", "get_llm_client", "get_message_template", "PromptTemplate", "create_audit_prompt"]
src/llm/adapters.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM client adapters for different providers."""
2
+
3
+ from typing import Dict, Any, List, Optional, Union
4
+ from abc import ABC, abstractmethod
5
+ from dataclasses import dataclass
6
+
7
+ # LangChain imports
8
+ from langchain_mistralai.chat_models import ChatMistralAI
9
+ from langchain_openai.chat_models import ChatOpenAI
10
+ from langchain_ollama import ChatOllama
11
+
12
+ # Legacy client dependencies
13
+ from huggingface_hub import InferenceClient
14
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
15
+ from langchain_community.llms import HuggingFaceEndpoint
16
+ from langchain_community.chat_models.huggingface import ChatHuggingFace
17
+
18
+ # Configuration loader
19
+ from ..config.loader import load_config
20
+
21
+ # Load configuration once at module level
22
+ _config = load_config()
23
+
24
+
25
+ # Legacy client factory functions (inlined from auditqa_old.reader)
26
+ def _create_inf_provider_client():
27
+ """Create INF_PROVIDERS client."""
28
+ reader_config = _config.get("reader", {})
29
+ inf_config = reader_config.get("INF_PROVIDERS", {})
30
+
31
+ api_key = inf_config.get("api_key")
32
+ if not api_key:
33
+ raise ValueError("INF_PROVIDERS api_key not found in configuration")
34
+
35
+ provider = inf_config.get("provider")
36
+ if not provider:
37
+ raise ValueError("INF_PROVIDERS provider not found in configuration")
38
+
39
+ return InferenceClient(
40
+ provider=provider,
41
+ api_key=api_key,
42
+ bill_to="GIZ",
43
+ )
44
+
45
+
46
+ def _create_nvidia_client():
47
+ """Create NVIDIA client."""
48
+ reader_config = _config.get("reader", {})
49
+ nvidia_config = reader_config.get("NVIDIA", {})
50
+
51
+ api_key = nvidia_config.get("api_key")
52
+ if not api_key:
53
+ raise ValueError("NVIDIA api_key not found in configuration")
54
+
55
+ endpoint = nvidia_config.get("endpoint")
56
+ if not endpoint:
57
+ raise ValueError("NVIDIA endpoint not found in configuration")
58
+
59
+ return InferenceClient(
60
+ base_url=endpoint,
61
+ api_key=api_key
62
+ )
63
+
64
+
65
+ def _create_serverless_client():
66
+ """Create serverless API client."""
67
+ reader_config = _config.get("reader", {})
68
+ serverless_config = reader_config.get("SERVERLESS", {})
69
+
70
+ api_key = serverless_config.get("api_key")
71
+ if not api_key:
72
+ raise ValueError("SERVERLESS api_key not found in configuration")
73
+
74
+ model_id = serverless_config.get("model", "meta-llama/Meta-Llama-3-8B-Instruct")
75
+
76
+ return InferenceClient(
77
+ model=model_id,
78
+ api_key=api_key,
79
+ )
80
+
81
+
82
+ def _create_dedicated_endpoint_client():
83
+ """Create dedicated endpoint client."""
84
+ reader_config = _config.get("reader", {})
85
+ dedicated_config = reader_config.get("DEDICATED", {})
86
+
87
+ api_key = dedicated_config.get("api_key")
88
+ if not api_key:
89
+ raise ValueError("DEDICATED api_key not found in configuration")
90
+
91
+ endpoint = dedicated_config.get("endpoint")
92
+ if not endpoint:
93
+ raise ValueError("DEDICATED endpoint not found in configuration")
94
+
95
+ max_tokens = dedicated_config.get("max_tokens", 768)
96
+
97
+ # Set up the streaming callback handler
98
+ callback = StreamingStdOutCallbackHandler()
99
+
100
+ # Initialize the HuggingFaceEndpoint with streaming enabled
101
+ llm_qa = HuggingFaceEndpoint(
102
+ endpoint_url=endpoint,
103
+ max_new_tokens=int(max_tokens),
104
+ repetition_penalty=1.03,
105
+ timeout=70,
106
+ huggingfacehub_api_token=api_key,
107
+ streaming=True,
108
+ callbacks=[callback]
109
+ )
110
+
111
+ # Create a ChatHuggingFace instance with the streaming-enabled endpoint
112
+ return ChatHuggingFace(llm=llm_qa)
113
+
114
+
115
+ @dataclass
116
+ class LLMResponse:
117
+ """Standardized LLM response format."""
118
+ content: str
119
+ model: str
120
+ provider: str
121
+ metadata: Dict[str, Any] = None
122
+
123
+
124
+ class BaseLLMAdapter(ABC):
125
+ """Base class for LLM adapters."""
126
+
127
+ def __init__(self, config: Dict[str, Any]):
128
+ self.config = config
129
+
130
+ @abstractmethod
131
+ def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
132
+ """Generate response from messages."""
133
+ pass
134
+
135
+ @abstractmethod
136
+ def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
137
+ """Generate streaming response from messages."""
138
+ pass
139
+
140
+
141
+ class MistralAdapter(BaseLLMAdapter):
142
+ """Adapter for Mistral AI models."""
143
+
144
+ def __init__(self, config: Dict[str, Any]):
145
+ super().__init__(config)
146
+ self.model = ChatMistralAI(
147
+ model=config.get("model", "mistral-medium-latest")
148
+ )
149
+
150
+ def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
151
+ """Generate response using Mistral."""
152
+ response = self.model.invoke(messages)
153
+
154
+ return LLMResponse(
155
+ content=response.content,
156
+ model=self.config.get("model", "mistral-medium-latest"),
157
+ provider="mistral",
158
+ metadata={"usage": getattr(response, 'usage_metadata', {})}
159
+ )
160
+
161
+ def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
162
+ """Generate streaming response using Mistral."""
163
+ for chunk in self.model.stream(messages):
164
+ if chunk.content:
165
+ yield chunk.content
166
+
167
+
168
+ class OpenAIAdapter(BaseLLMAdapter):
169
+ """Adapter for OpenAI models."""
170
+
171
+ def __init__(self, config: Dict[str, Any]):
172
+ super().__init__(config)
173
+ self.model = ChatOpenAI(
174
+ model=config.get("model", "gpt-4o-mini")
175
+ )
176
+
177
+ def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
178
+ """Generate response using OpenAI."""
179
+ response = self.model.invoke(messages)
180
+
181
+ return LLMResponse(
182
+ content=response.content,
183
+ model=self.config.get("model", "gpt-4o-mini"),
184
+ provider="openai",
185
+ metadata={"usage": getattr(response, 'usage_metadata', {})}
186
+ )
187
+
188
+ def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
189
+ """Generate streaming response using OpenAI."""
190
+ for chunk in self.model.stream(messages):
191
+ if chunk.content:
192
+ yield chunk.content
193
+
194
+
195
+ class OllamaAdapter(BaseLLMAdapter):
196
+ """Adapter for Ollama models."""
197
+
198
+ def __init__(self, config: Dict[str, Any]):
199
+ super().__init__(config)
200
+ self.model = ChatOllama(
201
+ model=config.get("model", "mistral-small3.1:24b-instruct-2503-q8_0"),
202
+ base_url=config.get("base_url", "http://localhost:11434/"),
203
+ temperature=config.get("temperature", 0.8),
204
+ num_predict=config.get("num_predict", 256)
205
+ )
206
+
207
+ def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
208
+ """Generate response using Ollama."""
209
+ response = self.model.invoke(messages)
210
+
211
+ return LLMResponse(
212
+ content=response.content,
213
+ model=self.config.get("model", "mistral-small3.1:24b-instruct-2503-q8_0"),
214
+ provider="ollama",
215
+ metadata={}
216
+ )
217
+
218
+ def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
219
+ """Generate streaming response using Ollama."""
220
+ for chunk in self.model.stream(messages):
221
+ if chunk.content:
222
+ yield chunk.content
223
+
224
+
225
+ class OpenRouterAdapter(BaseLLMAdapter):
226
+ """Adapter for OpenRouter models."""
227
+
228
+ def __init__(self, config: Dict[str, Any]):
229
+ super().__init__(config)
230
+
231
+ # Prepare custom headers for OpenRouter (optional)
232
+ headers = {}
233
+ if config.get("site_url"):
234
+ headers["HTTP-Referer"] = config["site_url"]
235
+ if config.get("site_name"):
236
+ headers["X-Title"] = config["site_name"]
237
+
238
+ # Initialize ChatOpenAI with OpenRouter configuration
239
+ self.model = ChatOpenAI(
240
+ model=config.get("model", "openai/gpt-3.5-turbo"),
241
+ api_key=config.get("api_key"),
242
+ base_url=config.get("base_url", "https://openrouter.ai/api/v1"),
243
+ default_headers=headers if headers else {},
244
+ temperature=config.get("temperature", 0.7),
245
+ max_tokens=config.get("max_tokens", 1000)
246
+ )
247
+
248
+ def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
249
+ """Generate response using OpenRouter."""
250
+ response = self.model.invoke(messages)
251
+
252
+ return LLMResponse(
253
+ content=response.content,
254
+ model=self.config.get("model", "openai/gpt-3.5-turbo"),
255
+ provider="openrouter",
256
+ metadata={"usage": getattr(response, 'usage_metadata', {})}
257
+ )
258
+
259
+ def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
260
+ """Generate streaming response using OpenRouter."""
261
+ for chunk in self.model.stream(messages):
262
+ if chunk.content:
263
+ yield chunk.content
264
+
265
+
266
+ class LegacyAdapter(BaseLLMAdapter):
267
+ """Adapter for legacy LLM clients (INF_PROVIDERS, NVIDIA, etc.)."""
268
+
269
+ def __init__(self, config: Dict[str, Any], client_type: str):
270
+ super().__init__(config)
271
+ self.client_type = client_type
272
+ self.client = self._create_client()
273
+
274
+ def _create_client(self):
275
+ """Create legacy client based on type."""
276
+ if self.client_type == "INF_PROVIDERS":
277
+ return _create_inf_provider_client()
278
+ elif self.client_type == "NVIDIA":
279
+ return _create_nvidia_client()
280
+ elif self.client_type == "DEDICATED":
281
+ return _create_dedicated_endpoint_client()
282
+ else: # SERVERLESS
283
+ return _create_serverless_client()
284
+
285
+ def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
286
+ """Generate response using legacy client."""
287
+ max_tokens = kwargs.get('max_tokens', self.config.get('max_tokens', 768))
288
+
289
+ if self.client_type == "INF_PROVIDERS":
290
+ response = self.client.chat.completions.create(
291
+ model=self.config.get("model"),
292
+ messages=messages,
293
+ max_tokens=max_tokens
294
+ )
295
+ content = response.choices[0].message.content
296
+
297
+ elif self.client_type == "NVIDIA":
298
+ response = self.client.chat_completion(
299
+ model=self.config.get("model"),
300
+ messages=messages,
301
+ max_tokens=max_tokens
302
+ )
303
+ content = response.choices[0].message.content
304
+
305
+ else: # DEDICATED or SERVERLESS
306
+ response = self.client.chat_completion(
307
+ messages=messages,
308
+ max_tokens=max_tokens
309
+ )
310
+ content = response.choices[0].message.content
311
+
312
+ return LLMResponse(
313
+ content=content,
314
+ model=self.config.get("model", "unknown"),
315
+ provider=self.client_type.lower(),
316
+ metadata={}
317
+ )
318
+
319
+ def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
320
+ """Generate streaming response using legacy client."""
321
+ # Legacy clients may not support streaming in the same way
322
+ # This is a simplified implementation
323
+ response = self.generate(messages, **kwargs)
324
+ words = response.content.split()
325
+ for word in words:
326
+ yield word + " "
327
+
328
+
329
+ class LLMRegistry:
330
+ """Registry for managing different LLM adapters."""
331
+
332
+ def __init__(self):
333
+ self.adapters = {}
334
+ self.adapter_configs = {}
335
+
336
+ def register_adapter(self, name: str, adapter_class: type, config: Dict[str, Any]):
337
+ """Register an LLM adapter (lazy instantiation)."""
338
+ self.adapter_configs[name] = (adapter_class, config)
339
+
340
+ def get_adapter(self, name: str) -> BaseLLMAdapter:
341
+ """Get an LLM adapter by name (lazy instantiation)."""
342
+ if name not in self.adapter_configs:
343
+ raise ValueError(f"Unknown LLM adapter: {name}")
344
+
345
+ # Lazy instantiation - only create when needed
346
+ if name not in self.adapters:
347
+ adapter_class, config = self.adapter_configs[name]
348
+ self.adapters[name] = adapter_class(config)
349
+
350
+ return self.adapters[name]
351
+
352
+ def list_adapters(self) -> List[str]:
353
+ """List available adapter names."""
354
+ return list(self.adapter_configs.keys())
355
+
356
+
357
+ def create_llm_registry(config: Dict[str, Any]) -> LLMRegistry:
358
+ """
359
+ Create and populate LLM registry from configuration.
360
+
361
+ Args:
362
+ config: Configuration dictionary
363
+
364
+ Returns:
365
+ Populated LLMRegistry
366
+ """
367
+ registry = LLMRegistry()
368
+ reader_config = config.get("reader", {})
369
+
370
+ # Register simple adapters
371
+ if "MISTRAL" in reader_config:
372
+ registry.register_adapter("mistral", MistralAdapter, reader_config["MISTRAL"])
373
+
374
+ if "OPENAI" in reader_config:
375
+ registry.register_adapter("openai", OpenAIAdapter, reader_config["OPENAI"])
376
+
377
+ if "OLLAMA" in reader_config:
378
+ registry.register_adapter("ollama", OllamaAdapter, reader_config["OLLAMA"])
379
+
380
+ if "OPENROUTER" in reader_config:
381
+ registry.register_adapter("openrouter", OpenRouterAdapter, reader_config["OPENROUTER"])
382
+
383
+ # Register legacy adapters
384
+ # legacy_types = ["INF_PROVIDERS", "NVIDIA", "DEDICATED"]
385
+ legacy_types = ["INF_PROVIDERS"]
386
+ for legacy_type in legacy_types:
387
+ if legacy_type in reader_config:
388
+ registry.register_adapter(
389
+ legacy_type.lower(),
390
+ lambda cfg, lt=legacy_type: LegacyAdapter(cfg, lt),
391
+ reader_config[legacy_type]
392
+ )
393
+
394
+ return registry
395
+
396
+
397
+ def get_llm_client(provider: str, config: Dict[str, Any]) -> BaseLLMAdapter:
398
+ """
399
+ Get LLM client for specified provider.
400
+
401
+ Args:
402
+ provider: Provider name (mistral, openai, ollama, etc.)
403
+ config: Configuration dictionary
404
+
405
+ Returns:
406
+ LLM adapter instance
407
+ """
408
+ registry = create_llm_registry(config)
409
+ return registry.get_adapter(provider)
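A usage sketch for the registry defined above. The "openai" key follows create_llm_registry's naming; an OPENAI_API_KEY in the environment is assumed, since ChatOpenAI reads it implicitly.

from src.config.loader import load_config
from src.llm.adapters import get_llm_client

config = load_config()                  # reader.OPENAI comes from settings.yaml
client = get_llm_client("openai", config)

messages = [
    {"role": "system", "content": "You are AuditQ&A."},
    {"role": "user", "content": "What is a PDM SACCO?"},
]
reply = client.generate(messages)       # returns an LLMResponse dataclass
print(reply.provider, reply.model)
print(reply.content)

for token in client.stream_generate(messages):  # incremental output
    print(token, end="")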
src/llm/templates.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM prompt templates and message formatting utilities."""
2
+
3
+ from typing import List, Dict, Any, Union
4
+ from dataclasses import dataclass
5
+ from langchain.schema import SystemMessage, HumanMessage
6
+
7
+
8
+ @dataclass
9
+ class PromptTemplate:
10
+ """Template for managing prompts with variables."""
11
+
12
+ system_prompt: str
13
+ user_prompt_template: str
14
+
15
+ def format(self, **kwargs) -> tuple:
16
+ """Format the template with provided variables."""
17
+ formatted_user = self.user_prompt_template.format(**kwargs)
18
+ return self.system_prompt, formatted_user
19
+
20
+
21
+ # Default system prompt for audit Q&A
22
+ DEFAULT_AUDIT_SYSTEM_PROMPT = """
23
+ You are AuditQ&A, an AI Assistant for audit reports. Answer questions directly and factually based on the provided context.
24
+
25
+ Guidelines:
26
+ - Answer directly and concisely (2-3 sentences maximum)
27
+ - Use specific facts and numbers from the context
28
+ - Cite sources using [Doc i] format
29
+ - Be factual, not opinionated
30
+ - Avoid phrases like "From my point of view", "I think", "It seems"
31
+
32
+ Examples:
33
+
34
+ Query: "What challenges arise from contradictory PDM implementation guidelines?"
35
+ Context: [Retrieved documents about PDM guidelines contradictions]
36
+ Answer: "Contradictory PDM implementation guidelines cause challenges during implementation, as entities receive numerous and often conflicting directives from different authorities. For example, guidelines on transfer of funds to PDM SACCOs differ between the PDM Secretariat and PSST, and there are conflicting directives on fund diversion from various authorities."
37
+
38
+ Query: "What was the supplementary funding obtained for the wage budget?"
39
+ Context: [Retrieved documents about wage budget funding]
40
+ Answer: "The supplementary funding obtained for the wage budget was UGX.2,208,040,656."
41
+
42
+ Now answer the following question based on the provided context:
43
+ """
44
+
45
+ # Default user prompt template
46
+ DEFAULT_USER_PROMPT_TEMPLATE = """Passages:
47
+ {context}
48
+ -----------------------
49
+ Question: {question} - explained for an audit expert
50
+ Answer in English, citing the passages:
51
+ """
52
+
53
+
54
+ def create_audit_prompt(context_list: List[str], query: str) -> List[Dict[str, str]]:
55
+ """
56
+ Create audit Q&A prompt messages from context and query.
57
+
58
+ Args:
59
+ context_list: List of context passages
60
+ query: User query
61
+
62
+ Returns:
63
+ List of message dictionaries for LLM
64
+ """
65
+ # Join context passages with numbering
66
+ numbered_context = []
67
+ for i, passage in enumerate(context_list, 1):
68
+ numbered_context.append(f"Doc {i}: {passage}")
69
+
70
+ context_str = "\n\n".join(numbered_context)
71
+
72
+ # Format user prompt
73
+ user_prompt = DEFAULT_USER_PROMPT_TEMPLATE.format(
74
+ context=context_str,
75
+ question=query
76
+ )
77
+
78
+ # Return as message format
79
+ messages = [
80
+ {"role": "system", "content": DEFAULT_AUDIT_SYSTEM_PROMPT},
81
+ {"role": "user", "content": user_prompt}
82
+ ]
83
+
84
+ return messages
85
+
86
+
87
+ def get_message_template(
88
+ provider_type: str,
89
+ system_prompt: str,
90
+ user_prompt: str
91
+ ) -> List[Union[Dict[str, str], SystemMessage, HumanMessage]]:
92
+ """
93
+ Get message template based on LLM provider type.
94
+
95
+ Args:
96
+ provider_type: Type of LLM provider
97
+ system_prompt: System prompt content
98
+ user_prompt: User prompt content
99
+
100
+ Returns:
101
+ List of messages in the appropriate format for the provider
102
+ """
103
+ provider_type = provider_type.upper()
104
+
105
+ if provider_type in ['NVIDIA', 'INF_PROVIDERS', 'MISTRAL', 'OPENAI', 'OPENROUTER']:
106
+ # Dictionary format for API-based providers
107
+ messages = [
108
+ {"role": "system", "content": system_prompt},
109
+ {"role": "user", "content": user_prompt}
110
+ ]
111
+ elif provider_type in ['DEDICATED', 'SERVERLESS', 'OLLAMA']:
112
+ # LangChain message objects for local/dedicated providers
113
+ messages = [
114
+ SystemMessage(content=system_prompt),
115
+ HumanMessage(content=user_prompt)
116
+ ]
117
+ else:
118
+ # Default to dictionary format
119
+ messages = [
120
+ {"role": "system", "content": system_prompt},
121
+ {"role": "user", "content": user_prompt}
122
+ ]
123
+
124
+ return messages
125
+
126
+
127
+ def create_custom_prompt_template(
128
+ system_prompt: str,
129
+ user_template: str
130
+ ) -> PromptTemplate:
131
+ """
132
+ Create a custom prompt template.
133
+
134
+ Args:
135
+ system_prompt: System prompt content
136
+ user_template: User prompt template with placeholders
137
+
138
+ Returns:
139
+ PromptTemplate instance
140
+ """
141
+ return PromptTemplate(
142
+ system_prompt=system_prompt,
143
+ user_prompt_template=user_template
144
+ )
145
+
146
+
147
+ def create_evaluation_prompt(context_list: List[str], query: str, expected_answer: str) -> List[Dict[str, str]]:
148
+ """
149
+ Create prompt for evaluation purposes with expected answer.
150
+
151
+ Args:
152
+ context_list: List of context passages
153
+ query: User query
154
+ expected_answer: Expected/ground truth answer
155
+
156
+ Returns:
157
+ List of message dictionaries for evaluation
158
+ """
159
+ # Join context passages
160
+ context_str = "\n\n".join([f"Doc {i}: {passage}" for i, passage in enumerate(context_list, 1)])
161
+
162
+ evaluation_system_prompt = """
163
+ You are an evaluation assistant. Given context passages, a question, and an expected answer,
164
+ evaluate how well the provided context supports answering the question accurately.
165
+
166
+ Provide your evaluation focusing on:
167
+ 1. Relevance of the context to the question
168
+ 2. Completeness of information needed to answer
169
+ 3. Quality and accuracy of supporting details
170
+ """
171
+
172
+ user_prompt = f"""Context Passages:
173
+ {context_str}
174
+
175
+ Question: {query}
176
+ Expected Answer: {expected_answer}
177
+
178
+ Evaluation:"""
179
+
180
+ return [
181
+ {"role": "system", "content": evaluation_system_prompt},
182
+ {"role": "user", "content": user_prompt}
183
+ ]
184
+
185
+
186
+ def get_prompt_variants() -> Dict[str, PromptTemplate]:
187
+ """
188
+ Get different prompt template variants for testing.
189
+
190
+ Returns:
191
+ Dictionary of named prompt templates
192
+ """
193
+ variants = {
194
+ "standard": create_custom_prompt_template(
195
+ DEFAULT_AUDIT_SYSTEM_PROMPT,
196
+ DEFAULT_USER_PROMPT_TEMPLATE
197
+ ),
198
+
199
+ "concise": create_custom_prompt_template(
200
+ """You are an audit report AI assistant. Provide clear, concise answers based on the given context passages. Always cite sources using [Doc i] format.""",
201
+ """Context:\n{context}\n\nQuestion: {question}\nAnswer:"""
202
+ ),
203
+
204
+ "detailed": create_custom_prompt_template(
205
+ DEFAULT_AUDIT_SYSTEM_PROMPT + """\n\nAdditional Instructions:
206
+ - Provide detailed explanations with specific examples
207
+ - Include relevant numbers, dates, and financial figures when available
208
+ - Structure your response with clear headings when appropriate
209
+ - Explain the significance of findings in the context of governance and accountability""",
210
+ DEFAULT_USER_PROMPT_TEMPLATE
211
+ )
212
+ }
213
+
214
+ return variants
215
+
216
+
217
+ # Backward compatibility function
218
+ def format_context_with_citations(context_list: List[str]) -> str:
219
+ """
220
+ Format context list with document citations.
221
+
222
+ Args:
223
+ context_list: List of context passages
224
+
225
+ Returns:
226
+ Formatted context string with citations
227
+ """
228
+ formatted_passages = []
229
+ for i, passage in enumerate(context_list, 1):
230
+ formatted_passages.append(f"Doc {i}: {passage}")
231
+
232
+ return "\n\n".join(formatted_passages)
src/loader.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data loading utilities for chunks and JSON files."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import List, Dict, Any
6
+ from langchain.docstore.document import Document
7
+
8
+
9
+ def load_json(filepath: Path | str) -> List[Dict[str, Any]]:
10
+ """
11
+ Load JSON data from file.
12
+
13
+ Args:
14
+ filepath: Path to JSON file
15
+
16
+ Returns:
17
+ List of dictionaries containing the JSON data
18
+ """
19
+ filepath = Path(filepath)
20
+
21
+ if not filepath.exists():
22
+ raise FileNotFoundError(f"JSON file not found: {filepath}")
23
+
24
+ with open(filepath, 'r', encoding='utf-8') as f:
25
+ data = json.load(f)
26
+
27
+ return data
28
+
29
+
30
+ def open_file(filepath: Path | str) -> str:
31
+ """
32
+ Open and read a text file.
33
+
34
+ Args:
35
+ filepath: Path to text file
36
+
37
+ Returns:
38
+ File contents as string
39
+ """
40
+ filepath = Path(filepath)
41
+
42
+ if not filepath.exists():
43
+ raise FileNotFoundError(f"File not found: {filepath}")
44
+
45
+ with open(filepath, 'r', encoding='utf-8') as f:
46
+ content = f.read()
47
+
48
+ return content
49
+
50
+
51
+ def load_chunks(chunks_file: Path | str = None) -> List[Dict[str, Any]]:
52
+ """
53
+ Load document chunks from JSON file.
54
+
55
+ Args:
56
+ chunks_file: Path to chunks JSON file. If None, uses default path.
57
+
58
+ Returns:
59
+ List of chunk dictionaries
60
+ """
61
+ if chunks_file is None:
62
+ chunks_file = Path("reports/docling_chunks.json")
63
+
64
+ return load_json(chunks_file)
65
+
66
+
67
+ def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
68
+ """
69
+ Convert chunk dictionaries to LangChain Document objects.
70
+
71
+ Args:
72
+ chunks: List of chunk dictionaries
73
+
74
+ Returns:
75
+ List of Document objects
76
+ """
77
+ documents = []
78
+
79
+ for chunk in chunks:
80
+ doc = Document(
81
+ page_content=chunk.get("content", ""),
82
+ metadata=chunk.get("metadata", {})
83
+ )
84
+ documents.append(doc)
85
+
86
+ return documents
87
+
88
+
89
+ def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
90
+ """
91
+ Validate that chunks have required fields.
92
+
93
+ Args:
94
+ chunks: List of chunk dictionaries
95
+
96
+ Returns:
97
+ True if valid, raises ValueError if invalid
98
+ """
99
+ required_fields = ["content", "metadata"]
100
+
101
+ for i, chunk in enumerate(chunks):
102
+ for field in required_fields:
103
+ if field not in chunk:
104
+ raise ValueError(f"Chunk {i} missing required field: {field}")
105
+
106
+ # Validate metadata has required fields
107
+ metadata = chunk["metadata"]
108
+ if not isinstance(metadata, dict):
109
+ raise ValueError(f"Chunk {i} metadata must be a dictionary")
110
+
111
+ # Check for common metadata fields
112
+ if "filename" not in metadata:
113
+ raise ValueError(f"Chunk {i} metadata missing 'filename' field")
114
+
115
+ return True
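A minimal sketch of the chunk-loading flow above; the default chunks path matches paths.chunks_file in settings.yaml (reports/docling_chunks.json).

from src.loader import load_chunks, validate_chunks, chunks_to_documents

chunks = load_chunks()           # defaults to reports/docling_chunks.json
validate_chunks(chunks)          # raises ValueError on missing content/metadata/filename
documents = chunks_to_documents(chunks)
print(len(documents), documents[0].metadata.get("filename"))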
src/logging.py ADDED
@@ -0,0 +1,193 @@
1
+ """Logging utilities (placeholder for legacy compatibility)."""
2
+ import os
+ import json
3
+ import logging
4
+ from uuid import uuid4
5
+ from pathlib import Path
6
+ from threading import Lock
7
+ from datetime import datetime
8
+ from typing import Dict, Any, Optional
9
+
10
+ from .config import load_config
11
+
12
+ def save_logs(
13
+ scheduler=None,
14
+ json_dataset_path: Path = None,
15
+ logs_data: Dict[str, Any] = None,
16
+ feedback: str = None
17
+ ) -> None:
18
+ """
19
+ Append a log record to the JSONL dataset (legacy-compatible signature).
20
+
21
+ Args:
22
+ scheduler: HuggingFace scheduler (not used in refactored version)
23
+ json_dataset_path: Path to JSON dataset
24
+ logs_data: Log data dictionary
25
+ feedback: User feedback
26
+
27
+ Note:
28
+ Kept with the legacy signature for backward compatibility; each record is
29
+ appended to the JSONL file as one JSON object per line, using the scheduler's lock when available.
30
+ """
31
+ if not is_logging_enabled():
32
+ return
33
+ try:
34
+ current_time = datetime.now().timestamp()
35
+ logs_data["time"] = str(current_time)
36
+ if feedback:
37
+ logs_data["feedback"] = feedback
38
+ logs_data["record_id"] = str(uuid4())
39
+ field_order = [
40
+ "record_id",
41
+ "session_id",
42
+ "time",
43
+ "session_duration_seconds",
44
+ "client_location",
45
+ "platform",
46
+ "system_prompt",
47
+ "sources",
48
+ "reports",
49
+ "subtype",
50
+ "year",
51
+ "question",
52
+ "retriever",
53
+ "endpoint_type",
54
+ "reader",
55
+ "docs",
56
+ "answer",
57
+ "feedback"
58
+ ]
59
+ ordered_logs = {k: logs_data.get(k) for k in field_order if k in logs_data}
60
+ lock = getattr(scheduler, "lock", None)
61
+ if lock is None:
62
+ lock = Lock()
63
+ with lock:
64
+ with open(json_dataset_path, 'a') as f:
65
+ json.dump(ordered_logs, f)
66
+ f.write("\n")
67
+ logging.info("logging done")
68
+ except Exception as e:
69
+ logging.error(f"Error saving logs: {e}")
70
+ raise
71
+
72
+
73
+ def setup_logging(log_level: str = "INFO", log_file: str = None) -> None:
74
+ """
75
+ Set up logging configuration.
76
+
77
+ Args:
78
+ log_level: Logging level
79
+ log_file: Optional log file path
80
+ """
81
+ if not is_logging_enabled():
82
+ return
83
+
84
+ # Configure logging
85
+ logging.basicConfig(
86
+ level=getattr(logging, log_level.upper()),
87
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
88
+ handlers=[
89
+ logging.StreamHandler(),
90
+ logging.FileHandler(log_file) if log_file else logging.NullHandler()
91
+ ]
92
+ )
93
+
94
+
95
+ def log_query_response(
96
+ query: str,
97
+ response: str,
98
+ metadata: Dict[str, Any] = None
99
+ ) -> None:
100
+ """
101
+ Log query and response for analysis.
102
+
103
+ Args:
104
+ query: User query
105
+ response: System response
106
+ metadata: Additional metadata
107
+ """
108
+ if not is_logging_enabled():
109
+ return
110
+
111
+ logger = logging.getLogger(__name__)
112
+
113
+ log_entry = {
114
+ "query": query,
115
+ "response_length": len(response),
116
+ "metadata": metadata or {}
117
+ }
118
+
119
+ logger.info(f"Query processed: {log_entry}")
120
+
121
+
122
+ def log_error(error: Exception, context: Dict[str, Any] = None) -> None:
123
+ """
124
+ Log error with context.
125
+
126
+ Args:
127
+ error: Exception that occurred
128
+ context: Additional context information
129
+ """
130
+ if not is_logging_enabled():
131
+ return
132
+
133
+ logger = logging.getLogger(__name__)
134
+
135
+ error_info = {
136
+ "error_type": type(error).__name__,
137
+ "error_message": str(error),
138
+ "context": context or {}
139
+ }
140
+
141
+ logger.error(f"Error occurred: {error_info}")
142
+
143
+
144
+ def log_performance_metrics(
145
+ operation: str,
146
+ duration: float,
147
+ metadata: Dict[str, Any] = None
148
+ ) -> None:
149
+ """
150
+ Log performance metrics.
151
+
152
+ Args:
153
+ operation: Name of the operation
154
+ duration: Duration in seconds
155
+ metadata: Additional metadata
156
+ """
157
+ if not is_logging_enabled():
158
+ return
159
+
160
+ logger = logging.getLogger(__name__)
161
+
162
+ metrics = {
163
+ "operation": operation,
164
+ "duration_seconds": duration,
165
+ "metadata": metadata or {}
166
+ }
167
+
168
+ logger.info(f"Performance metrics: {metrics}")
169
+
170
+
171
+ def is_session_enabled() -> bool:
172
+ """
173
+ Returns True if session management is enabled, False otherwise.
174
+ Checks environment variable ENABLE_SESSION first, then config.
175
+ """
176
+ env = os.getenv("ENABLE_SESSION")
177
+ if env is not None:
178
+ return env.lower() in ("1", "true", "yes", "on")
179
+ config = load_config()
180
+ return config.get("features", {}).get("enable_session", True)
181
+
182
+
183
+ def is_logging_enabled() -> bool:
184
+ """
185
+ Returns True if logging is enabled, False otherwise.
186
+ Checks environment variable ENABLE_LOGGING first, then config.
187
+ """
188
+ env = os.getenv("ENABLE_LOGGING")
189
+ if env is not None:
190
+ return env.lower() in ("1", "true", "yes", "on")
191
+ config = load_config()
192
+ return config.get("features", {}).get("enable_logging", True)
193
+
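A short sketch of how these helpers and toggles are driven; the ENABLE_LOGGING value, the logs.jsonl path, and the log payload are illustrative, not part of this commit:

    import os
    from pathlib import Path
    from src.logging import save_logs, log_performance_metrics

    os.environ["ENABLE_LOGGING"] = "true"   # is_logging_enabled() reads this before the config

    # Append one query record to a JSONL dataset; scheduler (and its lock) is optional.
    save_logs(
        scheduler=None,
        json_dataset_path=Path("logs.jsonl"),
        logs_data={"session_id": "abc123", "question": "What did the audit find?", "answer": "..."},
        feedback="okay",
    )
    log_performance_metrics("retrieval", duration=0.42, metadata={"k": 5})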
src/pipeline.py ADDED
@@ -0,0 +1,731 @@
1
+ """Main pipeline orchestrator for the Audit QA system."""
2
+ import time
3
+ from pathlib import Path
4
+ from dataclasses import dataclass
5
+ from typing import Dict, Any, List, Optional
6
+
7
+ from langchain.docstore.document import Document
8
+
9
+ from .logging import log_error
10
+ from .llm.adapters import LLMRegistry
11
+ from .loader import chunks_to_documents
12
+ from .vectorstore import VectorStoreManager
13
+ from .retrieval.context import ContextRetriever
14
+ from .config.loader import get_embedding_model_for_collection
15
+
16
+
17
+
18
+ @dataclass
19
+ class PipelineResult:
20
+ """Result of pipeline execution."""
21
+ answer: str
22
+ sources: List[Document]
23
+ execution_time: float
24
+ metadata: Dict[str, Any]
25
+ query: str = "" # Add default value for query
26
+
27
+ def __post_init__(self):
28
+ """Post-initialization processing."""
29
+ if not self.query:
30
+ self.query = "Unknown query"
31
+
32
+
33
+ class PipelineManager:
34
+ """Main pipeline manager for the RAG system."""
35
+
36
+ def __init__(self, config: dict = None):
37
+ """
38
+ Initialize the pipeline manager.
39
+ """
40
+ self.config = config or {}
41
+ self.vectorstore_manager = None
42
+ self.context_retriever = None # Initialize as None
43
+ self.llm_client = None
44
+ self.report_service = None
45
+ self.chunks = None
46
+
47
+ # Initialize components
48
+ self._initialize_components()
49
+
50
+ def update_config(self, new_config: dict):
51
+ """
52
+ Update the pipeline configuration.
53
+ This is useful for experiments that need different settings.
54
+ """
55
+ if not isinstance(new_config, dict):
56
+ return
57
+
58
+ # Deep merge the new config with existing config
59
+ def deep_merge(base_dict, update_dict):
60
+ for key, value in update_dict.items():
61
+ if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
62
+ deep_merge(base_dict[key], value)
63
+ else:
64
+ base_dict[key] = value
65
+
66
+ deep_merge(self.config, new_config)
67
+
68
+ # Auto-infer embedding model from collection name if not "docling"
69
+ collection_name = self.config.get('qdrant', {}).get('collection_name', 'docling')
70
+ if collection_name != 'docling':
71
+ inferred_model = get_embedding_model_for_collection(collection_name)
72
+ if inferred_model:
73
+ print(f"🔍 Auto-inferred embedding model for collection '{collection_name}': {inferred_model}")
74
+ if 'retriever' not in self.config:
75
+ self.config['retriever'] = {}
76
+ self.config['retriever']['model'] = inferred_model
77
+ # Set default normalize parameter if not present
78
+ if 'normalize' not in self.config['retriever']:
79
+ self.config['retriever']['normalize'] = True
80
+
81
+ # Also update vectorstore config if it exists
82
+ if 'vectorstore' in self.config:
83
+ self.config['vectorstore']['embedding_model'] = inferred_model
84
+
85
+ print(f"🔧 CONFIG UPDATED: Pipeline config updated with experiment settings")
86
+
87
+ # Re-initialize vectorstore manager with updated config
88
+ self._reinitialize_vectorstore_manager()
89
+
90
+ def _reinitialize_vectorstore_manager(self):
91
+ """Re-initialize vectorstore manager with current config."""
92
+ try:
93
+ self.vectorstore_manager = VectorStoreManager(self.config)
94
+ print("🔄 VectorStore manager re-initialized with updated config")
95
+ except Exception as e:
96
+ print(f"❌ Error re-initializing vectorstore manager: {e}")
97
+
98
+ def _get_reranker_model_name(self) -> str:
99
+ """
100
+ Get the reranker model name from configuration.
101
+
102
+ Returns:
103
+ Reranker model name or default
104
+ """
105
+ return (
106
+ self.config.get('retrieval', {}).get('reranker_model') or
107
+ self.config.get('ranker', {}).get('model') or
108
+ self.config.get('reranker_model') or
109
+ 'BAAI/bge-reranker-v2-m3'
110
+ )
111
+
112
+ def _initialize_components(self):
113
+ """Initialize pipeline components."""
114
+ try:
115
+ # Load config if not provided
116
+ if not self.config:
117
+ from auditqa.config.loader import load_config
118
+ self.config = load_config()
119
+
120
+ # Auto-infer embedding model from collection name if not "docling"
121
+ collection_name = self.config.get('qdrant', {}).get('collection_name', 'docling')
122
+ if collection_name != 'docling':
123
+ inferred_model = get_embedding_model_for_collection(collection_name)
124
+ if inferred_model:
125
+ print(f"🔍 Auto-inferred embedding model for collection '{collection_name}': {inferred_model}")
126
+ if 'retriever' not in self.config:
127
+ self.config['retriever'] = {}
128
+ self.config['retriever']['model'] = inferred_model
129
+ # Set default normalize parameter if not present
130
+ if 'normalize' not in self.config['retriever']:
131
+ self.config['retriever']['normalize'] = True
132
+
133
+ # Also update vectorstore config if it exists
134
+ if 'vectorstore' in self.config:
135
+ self.config['vectorstore']['embedding_model'] = inferred_model
136
+
137
+ self.vectorstore_manager = VectorStoreManager(self.config)
138
+
139
+ self.llm_manager = LLMRegistry()
140
+
141
+ # Try to get LLM client using the correct method
142
+ self.llm_client = None
143
+ try:
144
+ # Try using get_adapter method (most likely correct)
145
+ self.llm_client = self.llm_manager.get_adapter("openai")
146
+ print("✅ LLM CLIENT: Initialized using get_adapter method")
147
+ except Exception as e:
148
+ try:
149
+ # Try direct instantiation with config
150
+ from auditqa.llm.adapters import get_llm_client
151
+ self.llm_client = get_llm_client("openai", self.config)
152
+ print("✅ LLM CLIENT: Initialized using direct get_llm_client function with config")
153
+ except Exception as e2:
154
+ print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
155
+ # Try to create a simple LLM client directly
156
+ try:
157
+ from langchain_openai import ChatOpenAI
158
+ import os
159
+ api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
160
+ if api_key:
161
+ self.llm_client = ChatOpenAI(
162
+ model="gpt-3.5-turbo",
163
+ api_key=api_key,
164
+ temperature=0.1,
165
+ max_tokens=1000
166
+ )
167
+ print("✅ LLM CLIENT: Initialized using direct ChatOpenAI")
168
+ else:
169
+ print("❌ LLM CLIENT: No API key available")
170
+ except Exception as e3:
171
+ print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
172
+ self.llm_client = None
173
+
174
+ # Load system prompt
175
+ from auditqa.llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
176
+ self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
177
+
178
+ # Initialize report service
179
+ try:
180
+ from auditqa.reporting.service import ReportService
181
+ self.report_service = ReportService()
182
+ except Exception as e:
183
+ print(f"Warning: Could not initialize report service: {e}")
184
+ self.report_service = None
185
+
186
+ except Exception as e:
187
+ print(f"Warning: Error initializing components: {e}")
188
+
189
+ def test_retrieval(
190
+ self,
191
+ query: str,
192
+ reports: List[str] = None,
193
+ sources: str = None,
194
+ subtype: List[str] = None,
195
+ k: int = None,
196
+ search_mode: str = None,
197
+ search_alpha: float = None,
198
+ use_reranking: bool = True
199
+ ) -> Dict[str, Any]:
200
+ """
201
+ Test retrieval only without LLM inference.
202
+
203
+ Args:
204
+ query: User query
205
+ reports: List of specific report filenames
206
+ sources: Source category
207
+ subtype: List of subtypes
208
+ k: Number of documents to retrieve
209
+ search_mode: Search mode ('vector_only', 'sparse_only', or 'hybrid')
210
+ search_alpha: Weight for vector scores in hybrid mode
211
+ use_reranking: Whether to use reranking
212
+
213
+ Returns:
214
+ Dictionary with retrieval results and metadata
215
+ """
216
+ start_time = time.time()
217
+
218
+ try:
219
+ # Set default search parameters if not provided
220
+ if search_mode is None:
221
+ search_mode = self.config.get("hybrid", {}).get("default_mode", "vector_only")
222
+ if search_alpha is None:
223
+ search_alpha = self.config.get("hybrid", {}).get("default_alpha", 0.5)
224
+
225
+ # Get vector store
226
+ vectorstore = self.vectorstore_manager.get_vectorstore()
227
+ if not vectorstore:
228
+ raise ValueError(
229
+ "Vector store not available. Call connect_vectorstore() or create_vectorstore() first."
230
+ )
231
+
232
+ # Retrieve context with scores for test retrieval
233
+ context_docs_with_scores = self.context_retriever.retrieve_with_scores(
234
+ vectorstore=vectorstore,
235
+ query=query,
236
+ reports=reports,
237
+ sources=sources,
238
+ subtype=subtype,
239
+ k=k,
240
+ search_mode=search_mode,
241
+ alpha=search_alpha,
242
+ )
243
+
244
+ # Extract documents and scores
245
+ context_docs = [doc for doc, score in context_docs_with_scores]
246
+ context_scores = [score for doc, score in context_docs_with_scores]
247
+
248
+ execution_time = time.time() - start_time
249
+
250
+ # Format results with actual scores
251
+ results = []
252
+ for i, (doc, score) in enumerate(zip(context_docs, context_scores)):
253
+ results.append({
254
+ "rank": i + 1,
255
+ "content": doc.page_content, # Return full content without truncation
256
+ "metadata": doc.metadata,
257
+ "score": score if score is not None else 0.0
258
+ })
259
+
260
+ return {
261
+ "results": results,
262
+ "num_results": len(results),
263
+ "execution_time": execution_time,
264
+ "search_mode": search_mode,
265
+ "search_alpha": search_alpha,
266
+ "query": query
267
+ }
268
+
269
+ except Exception as e:
270
+ print(f"❌ Error during retrieval test: {e}")
271
+ log_error(e, {"component": "retrieval_test", "query": query})
272
+ return {
273
+ "results": [],
274
+ "num_results": 0,
275
+ "execution_time": time.time() - start_time,
276
+ "error": str(e),
277
+ "search_mode": search_mode or "unknown",
278
+ "search_alpha": search_alpha or 0.5,
279
+ "query": query
280
+ }
281
+
282
+ def connect_vectorstore(self, force_recreate: bool = False) -> bool:
283
+ """
284
+ Connect to existing vector store.
285
+
286
+ Args:
287
+ force_recreate: If True, recreate the collection if dimension mismatch occurs
288
+
289
+ Returns:
290
+ True if successful, False otherwise
291
+ """
292
+ try:
293
+ vectorstore = self.vectorstore_manager.connect_to_existing(force_recreate=force_recreate)
294
+ if vectorstore:
295
+ print("✅ Connected to vector store")
296
+ return True
297
+ else:
298
+ print("❌ Failed to connect to vector store")
299
+ return False
300
+ except Exception as e:
301
+ print(f"❌ Error connecting to vector store: {e}")
302
+ log_error(e, {"component": "vectorstore_connection"})
303
+
304
+ # If it's a dimension mismatch error, try with force_recreate
305
+ if "dimensions" in str(e).lower() and not force_recreate:
306
+ print("🔄 Dimension mismatch detected, attempting to recreate collection...")
307
+ try:
308
+ vectorstore = self.vectorstore_manager.connect_to_existing(force_recreate=True)
309
+ if vectorstore:
310
+ print("✅ Connected to vector store (recreated)")
311
+ return True
312
+ except Exception as recreate_error:
313
+ print(f"❌ Failed to recreate vector store: {recreate_error}")
314
+ log_error(recreate_error, {"component": "vectorstore_recreation"})
315
+
316
+ return False
317
+
318
+ def create_vectorstore(self) -> bool:
319
+ """
320
+ Create new vector store from chunks.
321
+
322
+ Returns:
323
+ True if successful, False otherwise
324
+ """
325
+ try:
326
+ if not self.chunks:
327
+ raise ValueError("No chunks available for vector store creation")
328
+
329
+ documents = chunks_to_documents(self.chunks)
330
+ self.vectorstore_manager.create_from_documents(documents)
331
+ print("✅ Vector store created successfully")
332
+ return True
333
+ except Exception as e:
334
+ print(f"❌ Error creating vector store: {e}")
335
+ log_error(e, {"component": "vectorstore_creation"})
336
+ return False
337
+
338
+ def create_audit_prompt(self, query: str, context_docs: List[Document]) -> str:
339
+ """Create a prompt for the LLM to generate an answer."""
340
+ try:
341
+ # Ensure query is not None
342
+ if not query or not isinstance(query, str) or query.strip() == "":
343
+ return "Error: No query provided"
344
+
345
+ # Ensure context_docs is not None and is a list
346
+ if context_docs is None:
347
+ context_docs = []
348
+
349
+ # Filter out None documents and ensure they have content
350
+ valid_docs = []
351
+ for doc in context_docs:
352
+ if doc is not None:
353
+ if hasattr(doc, 'page_content') and doc.page_content and isinstance(doc.page_content, str):
354
+ valid_docs.append(doc)
355
+ elif isinstance(doc, str) and doc.strip():
356
+ valid_docs.append(doc)
357
+
358
+ # Create context string
359
+ if valid_docs:
360
+ context_parts = []
361
+ for i, doc in enumerate(valid_docs, 1):
362
+ if hasattr(doc, 'page_content') and doc.page_content:
363
+ context_parts.append(f"Doc {i}: {doc.page_content}")
364
+ elif isinstance(doc, str) and doc.strip():
365
+ context_parts.append(f"Doc {i}: {doc}")
366
+
367
+ context_string = "\n\n".join(context_parts)
368
+ else:
369
+ context_string = "No relevant context found."
370
+
371
+ # Create the prompt
372
+ prompt = f"""
373
+ {self.system_prompt}
374
+
375
+ Context:
376
+ {context_string}
377
+
378
+ Query: {query}
379
+
380
+ Answer:"""
381
+
382
+ return prompt
383
+
384
+ except Exception as e:
385
+ print(f"Error creating audit prompt: {e}")
386
+ return f"Error creating prompt: {e}"
387
+
388
+ def _generate_answer(self, prompt: str) -> str:
389
+ """Generate answer using the LLM."""
390
+ try:
391
+ if not prompt or not isinstance(prompt, str) or prompt.strip() == "":
392
+ return "Error: No prompt provided"
393
+
394
+ # Ensure LLM client is available
395
+ if not self.llm_client:
396
+ return "Error: LLM client not available"
397
+
398
+ # Generate response using the correct method
399
+ if hasattr(self.llm_client, 'generate'):
400
+ # Use the generate method (for adapters)
401
+ response = self.llm_client.generate([{"role": "user", "content": prompt}])
402
+
403
+ # Extract content from LLMResponse
404
+ if hasattr(response, 'content'):
405
+ answer = response.content
406
+ else:
407
+ answer = str(response)
408
+
409
+ elif hasattr(self.llm_client, 'invoke'):
410
+ # Use the invoke method (for direct LangChain models)
411
+ response = self.llm_client.invoke(prompt)
412
+
413
+ # Extract content safely
414
+ if hasattr(response, 'content') and response.content is not None:
415
+ answer = response.content
416
+ elif isinstance(response, str) and response.strip():
417
+ answer = response
418
+ else:
419
+ answer = str(response) if response is not None else "Error: LLM returned None response"
420
+ else:
421
+ return "Error: LLM client has no generate or invoke method"
422
+
423
+ # Ensure answer is not None and is a string
424
+ if answer is None or not isinstance(answer, str):
425
+ return "Error: LLM returned invalid response"
426
+
427
+ return answer.strip()
428
+
429
+ except Exception as e:
430
+ print(f"Error generating answer: {e}")
431
+ return f"Error generating answer: {e}"
432
+
433
+ def run(
434
+ self,
435
+ query: str,
436
+ reports: List[str] = None,
437
+ sources: List[str] = None,
438
+ subtype: List[str] = None,
439
+ llm_provider: str = None,
440
+ use_reranking: bool = True,
441
+ search_mode: str = None,
442
+ search_alpha: float = None,
443
+ auto_infer_filters: bool = True,
444
+ filters: Dict[str, Any] = None,
445
+ ) -> PipelineResult:
446
+ """
447
+ Run the complete RAG pipeline.
448
+
449
+ Args:
450
+ query: User query
451
+ reports: List of specific report filenames
452
+ sources: Source category filter
453
+ subtype: List of subtypes/filenames
454
+ llm_provider: LLM provider to use
455
+ use_reranking: Whether to use reranking
456
+ search_mode: Search mode (vector, sparse, hybrid)
457
+ search_alpha: Alpha value for hybrid search
458
+ auto_infer_filters: Whether to auto-infer filters from query
459
+
460
+ Returns:
461
+ PipelineResult object
462
+ """
463
+ try:
464
+ # Validate input
465
+ if not query or not isinstance(query, str) or query.strip() == "":
466
+ return PipelineResult(
467
+ answer="Error: Invalid query provided",
468
+ sources=[],
469
+ execution_time=0.0,
470
+ metadata={'error': 'Invalid query'},
471
+ query=query
472
+ )
473
+
474
+ # Ensure lists are not None
475
+ if reports is None:
476
+ reports = []
477
+ if subtype is None:
478
+ subtype = []
479
+
480
+ start_time = time.time()
481
+
482
+ # Auto-infer filters if enabled and no explicit filters provided
483
+ inferred_filters = {}
484
+ filters_applied = False
485
+ qdrant_filter = None  # populated by auto-inference below when no explicit filters are given
486
+
487
+ if auto_infer_filters and not any([reports, sources, subtype]):
488
+ print(f"🤖 AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
489
+ try:
490
+ # Import get_available_metadata here to avoid circular imports
491
+ from auditqa.retrieval.filter import get_available_metadata, infer_filters_from_query
492
+
493
+ # Get available metadata
494
+ available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
495
+
496
+ # Infer filters from query - this returns a Qdrant filter
497
+ qdrant_filter, filter_summary = infer_filters_from_query(
498
+ query=query,
499
+ available_metadata=available_metadata,
500
+ llm_client=self.llm_client
501
+ )
502
+
503
+ if qdrant_filter:
504
+ print(f"✅ QDRANT FILTER APPLIED: Using inferred Qdrant filter")
505
+ filters_applied = True
506
+ # Don't set sources/reports/subtype - use the Qdrant filter directly
507
+ else:
508
+ print(f"⚠️ NO QDRANT FILTER: Could not build Qdrant filter from query")
509
+
510
+ except Exception as e:
511
+ print(f"❌ AUTO-INFERENCE FAILED: {e}")
512
+ qdrant_filter = None
513
+ else:
514
+ # Check if any explicit filters were provided
515
+ filters_applied = any([reports, sources, subtype])
516
+ if filters_applied:
517
+ print(f"✅ EXPLICIT FILTERS: Using provided filters")
518
+ else:
519
+ print(f"⚠️ NO FILTERS: No explicit filters and auto-inference disabled")
520
+
521
+ # Extract filter parameters from the filters parameter
522
+ reports = filters.get('reports', reports) if filters else reports
523
+ sources = filters.get('sources', sources) if filters else sources
524
+ subtype = filters.get('subtype', subtype) if filters else subtype
525
+ year = filters.get('year', []) if filters else []
526
+ district = filters.get('district', []) if filters else []
527
+ filenames = filters.get('filenames', []) if filters else [] # Support mutually exclusive filename filtering
528
+
529
+ # Get vectorstore
530
+ vectorstore = self.vectorstore_manager.get_vectorstore()
531
+ if not vectorstore:
532
+ return PipelineResult(
533
+ answer="Error: Vector store not available",
534
+ sources=[],
535
+ execution_time=0.0,
536
+ metadata={'error': 'Vector store not available'},
537
+ query=query
538
+ )
539
+
540
+ # Initialize context retriever if not already done
541
+ if not hasattr(self, 'context_retriever') or self.context_retriever is None:
542
+ # Get the actual vectorstore object
543
+ vectorstore_obj = self.vectorstore_manager.get_vectorstore()
544
+ if vectorstore_obj is None:
545
+ print("❌ ERROR: Vectorstore is None, cannot initialize ContextRetriever")
546
+ return PipelineResult(answer="Error: Vector store not available", sources=[], execution_time=0.0, metadata={'error': 'Vector store not available'}, query=query)
547
+ self.context_retriever = ContextRetriever(vectorstore_obj, self.config)
548
+ print("✅ ContextRetriever initialized successfully")
549
+
550
+ # Debug config access
551
+ print(f" CONFIG DEBUG: Full config keys: {list(self.config.keys()) if isinstance(self.config, dict) else 'Not a dict'}")
552
+ print(f"🔍 CONFIG DEBUG: Retriever config: {self.config.get('retriever', {})}")
553
+ print(f"🔍 CONFIG DEBUG: Retrieval config: {self.config.get('retrieval', {})}")
554
+ print(f"🔍 CONFIG DEBUG: use_reranking from config: {self.config.get('retrieval', {}).get('use_reranking', 'NOT_FOUND')}")
555
+
556
+ # Get the correct top_k value
557
+ # Priority: experiment config > retriever config > default
558
+ top_k = (
559
+ self.config.get('retrieval', {}).get('top_k') or
560
+ self.config.get('retriever', {}).get('top_k') or
561
+ 5
562
+ )
563
+
564
+ # Get reranking setting
565
+ use_reranking = self.config.get('retrieval', {}).get('use_reranking', False)
566
+
567
+ print(f"🔍 CONFIG DEBUG: Final top_k: {top_k}")
568
+ print(f"🔍 CONFIG DEBUG: Final use_reranking: {use_reranking}")
569
+
570
+ # Retrieve context using the context retriever
571
+ context_docs = self.context_retriever.retrieve_context(
572
+ query=query,
573
+ k=top_k,
574
+ reports=reports,
575
+ sources=sources,
576
+ subtype=subtype,
577
+ year=year,
578
+ district=district,
579
+ filenames=filenames,
580
+ use_reranking=use_reranking,
581
+ qdrant_filter=qdrant_filter
582
+ )
583
+
584
+ # Ensure context_docs is not None
585
+ if context_docs is None:
586
+ context_docs = []
587
+
588
+ # Generate answer
589
+ answer = self._generate_answer(self.create_audit_prompt(query, context_docs))
590
+
591
+ execution_time = time.time() - start_time
592
+
593
+ # Create result with comprehensive metadata
594
+ result = PipelineResult(
595
+ answer=answer,
596
+ sources=context_docs,
597
+ execution_time=execution_time,
598
+ metadata={
599
+ 'llm_provider': llm_provider,
600
+ 'use_reranking': use_reranking,
601
+ 'search_mode': search_mode,
602
+ 'search_alpha': search_alpha,
603
+ 'auto_infer_filters': auto_infer_filters,
604
+ 'filters_applied': filters_applied,
605
+ 'with_filtering': filters_applied,
606
+ 'filter_conditions': {
607
+ 'reports': reports,
608
+ 'sources': sources,
609
+ 'subtype': subtype
610
+ },
611
+ 'inferred_filters': inferred_filters,
612
+ 'applied_filters': {
613
+ 'reports': reports,
614
+ 'sources': sources,
615
+ 'subtype': subtype
616
+ },
617
+ # Store filter and reranking metadata
618
+ 'filter_details': {
619
+ 'explicit_filters': {
620
+ 'reports': reports,
621
+ 'sources': sources,
622
+ 'subtype': subtype,
623
+ 'year': year
624
+ },
625
+ 'inferred_filters': inferred_filters if auto_infer_filters else {},
626
+ 'auto_inference_enabled': auto_infer_filters,
627
+ 'qdrant_filter_applied': qdrant_filter is not None,
628
+ 'filter_summary': filter_summary if 'filter_summary' in locals() else None
629
+ },
630
+ 'reranker_model': self._get_reranker_model_name() if use_reranking else None,
631
+ 'reranker_applied': use_reranking,
632
+ 'reranking_info': {
633
+ 'model': self._get_reranker_model_name(),
634
+ 'applied': use_reranking,
635
+ 'top_k': len(context_docs) if context_docs else 0,
636
+ # 'original_documents': [
637
+ # {
638
+ # 'content': doc.page_content[:200] + '...' if len(doc.page_content) > 200 else doc.page_content,
639
+ # 'metadata': doc.metadata,
640
+ # 'score': getattr(doc, 'score', getattr(doc, 'original_score', 0.0))
641
+ # } for doc in context_docs
642
+ # ] if use_reranking else None,
643
+ 'reranked_documents': [
644
+ {
645
+ 'content': doc.page_content[:200] + '...' if len(doc.page_content) > 200 else doc.page_content,
646
+ 'metadata': doc.metadata,
647
+ 'score': doc.metadata.get('original_score', getattr(doc, 'score', 0.0)),
648
+ 'original_rank': doc.metadata.get('original_rank', None),
649
+ 'final_rank': doc.metadata.get('final_rank', None),
650
+ 'reranked_score': doc.metadata.get('reranked_score', None)
651
+ } for doc in context_docs
652
+ ] if use_reranking else None
653
+ }
654
+ },
655
+ query=query
656
+ )
657
+
658
+ return result
659
+
660
+ except Exception as e:
661
+ print(f"Error in pipeline run: {e}")
662
+ return PipelineResult(
663
+ answer=f"Error processing query: {e}",
664
+ sources=[],
665
+ execution_time=0.0,
666
+ metadata={'error': str(e)},
667
+ query=query
668
+ )
669
+
670
+
671
+
672
+ def get_system_status(self) -> Dict[str, Any]:
673
+ """
674
+ Get system status information.
675
+
676
+ Returns:
677
+ Dictionary with system status
678
+ """
679
+ status = {
680
+ "config_loaded": bool(self.config),
681
+ "chunks_loaded": bool(self.chunks),
682
+ "vectorstore_connected": bool(
683
+ self.vectorstore_manager and self.vectorstore_manager.get_vectorstore()
684
+ ),
685
+ "components_initialized": bool(
686
+ self.context_retriever and self.report_service
687
+ ),
688
+ }
689
+
690
+ if self.chunks:
691
+ status["num_chunks"] = len(self.chunks)
692
+
693
+ if self.report_service:
694
+ status["available_sources"] = self.report_service.get_available_sources()
695
+ status["available_reports"] = len(
696
+ self.report_service.get_available_reports()
697
+ )
698
+
699
+ status["overall_status"] = (
700
+ "ready"
701
+ if all(
702
+ [
703
+ status["config_loaded"],
704
+ status["chunks_loaded"],
705
+ status["vectorstore_connected"],
706
+ status["components_initialized"],
707
+ ]
708
+ )
709
+ else "not_ready"
710
+ )
711
+
712
+ return status
713
+
714
+ def get_available_llm_providers(self) -> List[str]:
715
+ """Get list of available LLM providers."""
716
+ providers = []
717
+ reader_config = self.config.get("reader", {})
718
+
719
+ for provider in [
720
+ "MISTRAL",
721
+ "OPENAI",
722
+ "OLLAMA",
723
+ "INF_PROVIDERS",
724
+ "NVIDIA",
725
+ "DEDICATED",
726
+ "OPENROUTER",
727
+ ]:
728
+ if provider in reader_config:
729
+ providers.append(provider.lower())
730
+
731
+ return providers
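End to end, the orchestrator above is meant to be used roughly as follows; the collection name and query are illustrative, and a reachable Qdrant collection plus an LLM API key are assumed:

    from src.pipeline import PipelineManager

    pipeline = PipelineManager(config={"qdrant": {"collection_name": "docling"}})
    if pipeline.connect_vectorstore():
        result = pipeline.run("What were the main audit findings for 2023?")
        print(result.answer)
        for doc in result.sources:
            print(doc.metadata.get("filename"), doc.metadata.get("page"))
    print(pipeline.get_system_status())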
src/reporting/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Report metadata and utilities."""
2
+
3
+ from .metadata import get_report_metadata, get_available_sources
4
+ from .service import ReportService
5
+
6
+ __all__ = ["get_report_metadata", "get_available_sources", "ReportService"]
src/reporting/feedback_schema.py ADDED
@@ -0,0 +1,196 @@
1
+ """
2
+ Feedback Schema for RAG Chatbot
3
+
4
+ This module defines dataclasses for feedback data structures
5
+ and provides Snowflake schema generation.
6
+ """
7
+
8
+ from dataclasses import dataclass, asdict, field
9
+ from typing import List, Optional, Dict, Any, Union
10
+ from datetime import datetime
11
+
12
+
13
+ @dataclass
14
+ class RetrievedDocument:
15
+ """Single retrieved document metadata"""
16
+ doc_id: str
17
+ filename: str
18
+ page: int
19
+ score: float
20
+ content: str
21
+ metadata: Dict[str, Any]
22
+
23
+
24
+ @dataclass
25
+ class RetrievalEntry:
26
+ """Single retrieval operation metadata"""
27
+ rag_query: str
28
+ documents_retrieved: List[RetrievedDocument]
29
+ conversation_length: int
30
+ filters_applied: Optional[Dict[str, Any]] = None
31
+ timestamp: Optional[float] = None
32
+ _raw_data: Optional[Dict[str, Any]] = None
33
+
34
+
35
+ @dataclass
36
+ class UserFeedback:
37
+ """User feedback submission data"""
38
+ feedback_id: str
39
+ open_ended_feedback: Optional[str]
40
+ score: int
41
+ is_feedback_about_last_retrieval: bool
42
+ retrieved_data: List[RetrievalEntry]
43
+ conversation_id: str
44
+ timestamp: float
45
+ message_count: int
46
+ has_retrievals: bool
47
+ retrieval_count: int
48
+ user_query: Optional[str] = None
49
+ bot_response: Optional[str] = None
50
+ created_at: str = field(default_factory=lambda: datetime.now().isoformat())
51
+
52
+ def to_dict(self) -> Dict[str, Any]:
53
+ """Convert to dictionary with nested data structures"""
54
+ result = asdict(self)
55
+ # Handle nested objects
56
+ if self.retrieved_data:
57
+ result['retrieved_data'] = [self._serialize_retrieval_entry(entry) for entry in self.retrieved_data]
58
+ return result
59
+
60
+ def _serialize_retrieval_entry(self, entry: RetrievalEntry) -> Dict[str, Any]:
61
+ """Serialize retrieval entry to dict"""
62
+ # If raw data exists, use it (it's already properly formatted)
63
+ if hasattr(entry, '_raw_data') and entry._raw_data:
64
+ return entry._raw_data
65
+
66
+ # Otherwise, serialize the dataclass
67
+ result = asdict(entry)
68
+ if entry.documents_retrieved:
69
+ result['documents_retrieved'] = [asdict(doc) for doc in entry.documents_retrieved]
70
+ return result
71
+
72
+ def to_snowflake_schema(self) -> Dict[str, Any]:
73
+ """Generate Snowflake schema for this dataclass"""
74
+ schema = {
75
+ "feedback_id": "VARCHAR(255)",
76
+ "open_ended_feedback": "VARCHAR(16777216)", # Large text
77
+ "score": "INTEGER",
78
+ "is_feedback_about_last_retrieval": "BOOLEAN",
79
+ "conversation_id": "VARCHAR(255)",
80
+ "timestamp": "NUMBER(20, 0)",
81
+ "message_count": "INTEGER",
82
+ "has_retrievals": "BOOLEAN",
83
+ "retrieval_count": "INTEGER",
84
+ "user_query": "VARCHAR(16777216)",
85
+ "bot_response": "VARCHAR(16777216)",
86
+ "created_at": "TIMESTAMP_NTZ",
87
+ "retrieved_data": "VARIANT", # Array of retrieval entries
88
+ # retrieved_data structure:
89
+ # [
90
+ # {
91
+ # "rag_query": "...",
92
+ # "conversation_length": 5,
93
+ # "timestamp": 1234567890,
94
+ # "docs_retrieved": [
95
+ # {"filename": "...", "page": 14, "score": 0.95, ...},
96
+ # ...
97
+ # ]
98
+ # },
99
+ # ...
100
+ # ]
101
+ }
102
+ return schema
103
+
104
+ @classmethod
105
+ def get_snowflake_create_table_sql(cls, table_name: str = "user_feedback") -> str:
106
+ """Generate CREATE TABLE SQL for Snowflake"""
107
+ schema = cls.to_snowflake_schema(None)
108
+
109
+ columns = []
110
+ for col_name, col_type in schema.items():
111
+ nullable = "NULL" if col_name not in ["feedback_id", "score", "timestamp"] else "NOT NULL"
112
+ columns.append(f" {col_name} {col_type} {nullable}")
113
+
114
+ # Build SQL string properly
115
+ columns_str = ",\n".join(columns)
116
+
117
+ sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
118
+ {columns_str},
119
+ PRIMARY KEY (feedback_id)
120
+ );
121
+
122
+ -- Create index on timestamp for querying by time
123
+ CREATE INDEX IF NOT EXISTS idx_feedback_timestamp ON {table_name} (timestamp);
124
+
125
+ -- Create index on conversation_id for querying by conversation
126
+ CREATE INDEX IF NOT EXISTS idx_feedback_conversation ON {table_name} (conversation_id);
127
+
128
+ -- Create index on score for feedback analysis
129
+ CREATE INDEX IF NOT EXISTS idx_feedback_score ON {table_name} (score);
130
+ """
131
+ return sql
132
+
133
+
134
+ # Snowflake variant schema for retrieved_data array
135
+ RETRIEVAL_ENTRY_SCHEMA = {
136
+ "rag_query": "VARCHAR",
137
+ "documents_retrieved": "ARRAY", # Array of document objects
138
+ "conversation_length": "INTEGER",
139
+ "filters_applied": "OBJECT",
140
+ "timestamp": "NUMBER"
141
+ }
142
+
143
+ DOCUMENT_SCHEMA = {
144
+ "doc_id": "VARCHAR",
145
+ "filename": "VARCHAR",
146
+ "page": "INTEGER",
147
+ "score": "DOUBLE",
148
+ "content": "VARCHAR(16777216)",
149
+ "metadata": "OBJECT"
150
+ }
151
+
152
+
153
+ def generate_snowflake_schema_sql() -> str:
154
+ """Generate complete Snowflake schema SQL for feedback system"""
155
+ return UserFeedback.get_snowflake_create_table_sql("user_feedback")
156
+
157
+
158
+ def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
159
+ """Create UserFeedback instance from dictionary"""
160
+ # Parse retrieved_data if present
161
+ retrieved_data = []
162
+ if "retrieved_data" in data and data["retrieved_data"]:
163
+ for entry_dict in data.get("retrieved_data", []):
164
+ # Map the actual structure from rag_retrieval_history
165
+ # Entry has: conversation_up_to, rag_query_expansion, docs_retrieved
166
+ try:
167
+ # Try to map to expected structure
168
+ entry = RetrievalEntry(
169
+ rag_query=entry_dict.get("rag_query_expansion", ""),
170
+ documents_retrieved=[], # Empty for now, will store as raw data
171
+ conversation_length=len(entry_dict.get("conversation_up_to", [])),
172
+ filters_applied=None,
173
+ timestamp=entry_dict.get("timestamp", None)
174
+ )
175
+ # Store raw data in the entry
176
+ entry._raw_data = entry_dict # Store original for preservation
177
+ retrieved_data.append(entry)
178
+ except Exception as e:
179
+ # If mapping fails, skip this entry rather than raising
180
+ pass
181
+
182
+ return UserFeedback(
183
+ feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
184
+ open_ended_feedback=data.get("open_ended_feedback"),
185
+ score=data["score"],
186
+ is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
187
+ retrieved_data=retrieved_data,
188
+ conversation_id=data["conversation_id"],
189
+ timestamp=data["timestamp"],
190
+ message_count=data["message_count"],
191
+ has_retrievals=data["has_retrievals"],
192
+ retrieval_count=data["retrieval_count"],
193
+ user_query=data.get("user_query"),
194
+ bot_response=data.get("bot_response")
195
+ )
196
+
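A brief sketch of how a UI feedback payload is expected to pass through this schema; the field values are illustrative:

    import time
    from src.reporting.feedback_schema import (
        create_feedback_from_dict,
        generate_snowflake_schema_sql,
    )

    payload = {
        "score": 1,
        "is_feedback_about_last_retrieval": True,
        "conversation_id": "conv-001",
        "timestamp": time.time(),
        "message_count": 4,
        "has_retrievals": True,
        "retrieval_count": 1,
        "open_ended_feedback": "Helpful answer",
        "retrieved_data": [{"rag_query_expansion": "audit findings 2023", "conversation_up_to": []}],
    }

    feedback = create_feedback_from_dict(payload)     # maps the dict onto UserFeedback
    print(feedback.to_dict()["retrieval_count"])
    print(generate_snowflake_schema_sql())            # CREATE TABLE ... user_feedback DDL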
src/reporting/metadata.py ADDED
@@ -0,0 +1,216 @@
1
+ """Report metadata management."""
2
+
3
+ from typing import Dict, List, Any, Set
4
+ from pathlib import Path
5
+
6
+
7
+ def get_report_metadata(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
8
+ """
9
+ Extract metadata from chunks.
10
+
11
+ Args:
12
+ chunks: List of chunk dictionaries
13
+
14
+ Returns:
15
+ Dictionary with report metadata
16
+ """
17
+ if not chunks:
18
+ return {}
19
+
20
+ sources = set()
21
+ filenames = set()
22
+ years = set()
23
+
24
+ for chunk in chunks:
25
+ metadata = chunk.get("metadata", {})
26
+
27
+ if "source" in metadata:
28
+ sources.add(metadata["source"])
29
+
30
+ if "filename" in metadata:
31
+ filenames.add(metadata["filename"])
32
+
33
+ if "year" in metadata:
34
+ years.add(metadata["year"])
35
+
36
+ return {
37
+ "sources": sorted(list(sources)),
38
+ "filenames": sorted(list(filenames)),
39
+ "years": sorted(list(years)),
40
+ "total_chunks": len(chunks)
41
+ }
42
+
43
+
44
+ def get_available_sources() -> List[str]:
45
+ """
46
+ Get list of available report sources (legacy compatibility).
47
+
48
+ Returns:
49
+ List of source categories
50
+ """
51
+ # This would typically come from the original auditqa_old.reports module
52
+ # For now, return common categories
53
+ return [
54
+ "Consolidated",
55
+ "Ministry, Department, Agency and Projects",
56
+ "Local Government",
57
+ "Value for Money",
58
+ "Thematic",
59
+ "Hospital",
60
+ "Project"
61
+ ]
62
+
63
+
64
+ def get_source_subtypes() -> Dict[str, List[str]]:
65
+ """
66
+ Get mapping of sources to their subtypes (placeholder).
67
+
68
+ Returns:
69
+ Dictionary mapping sources to subtypes
70
+ """
71
+ # This was originally imported from auditqa_old.reports.new_files
72
+ # For now, return a placeholder structure
73
+ return {
74
+ "Consolidated": ["Annual Consolidated OAG 2024", "Annual Consolidated OAG 2023"],
75
+ "Local Government": ["District Reports", "Municipal Reports"],
76
+ "Ministry, Department, Agency and Projects": ["Ministry Reports", "Agency Reports"],
77
+ "Value for Money": ["VFM Reports 2024", "VFM Reports 2023"],
78
+ "Thematic": ["Thematic Reports 2024", "Thematic Reports 2023"],
79
+ "Hospital": ["Hospital Reports 2024", "Hospital Reports 2023"],
80
+ "Project": ["Project Reports 2024", "Project Reports 2023"]
81
+ }
82
+
83
+
84
+ def validate_report_filters(
85
+ reports: List[str] = None,
86
+ sources: str = None,
87
+ subtype: List[str] = None,
88
+ available_metadata: Dict[str, Any] = None
89
+ ) -> Dict[str, Any]:
90
+ """
91
+ Validate report filter parameters.
92
+
93
+ Args:
94
+ reports: List of specific report filenames
95
+ sources: Source category
96
+ subtype: List of subtypes
97
+ available_metadata: Available metadata for validation
98
+
99
+ Returns:
100
+ Dictionary with validation results
101
+ """
102
+ validation_result = {
103
+ "valid": True,
104
+ "warnings": [],
105
+ "errors": []
106
+ }
107
+
108
+ if not available_metadata:
109
+ validation_result["warnings"].append("No metadata available for validation")
110
+ return validation_result
111
+
112
+ available_sources = available_metadata.get("sources", [])
113
+ available_filenames = available_metadata.get("filenames", [])
114
+
115
+ # Validate sources
116
+ if sources and sources not in available_sources:
117
+ validation_result["errors"].append(f"Source '{sources}' not found in available sources")
118
+ validation_result["valid"] = False
119
+
120
+ # Validate reports
121
+ if reports:
122
+ for report in reports:
123
+ if report not in available_filenames:
124
+ validation_result["warnings"].append(f"Report '{report}' not found in available reports")
125
+
126
+ # Validate subtypes
127
+ if subtype:
128
+ for sub in subtype:
129
+ if sub not in available_filenames:
130
+ validation_result["warnings"].append(f"Subtype '{sub}' not found in available reports")
131
+
132
+ return validation_result
133
+
134
+
135
+ def get_report_statistics(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
136
+ """
137
+ Get statistics about reports in chunks.
138
+
139
+ Args:
140
+ chunks: List of chunk dictionaries
141
+
142
+ Returns:
143
+ Dictionary with report statistics
144
+ """
145
+ if not chunks:
146
+ return {}
147
+
148
+ stats = {
149
+ "total_chunks": len(chunks),
150
+ "sources": {},
151
+ "years": {},
152
+ "avg_chunk_length": 0,
153
+ "total_content_length": 0
154
+ }
155
+
156
+ total_length = 0
157
+
158
+ for chunk in chunks:
159
+ content = chunk.get("content", "")
160
+ total_length += len(content)
161
+
162
+ metadata = chunk.get("metadata", {})
163
+
164
+ # Count by source
165
+ source = metadata.get("source", "Unknown")
166
+ stats["sources"][source] = stats["sources"].get(source, 0) + 1
167
+
168
+ # Count by year
169
+ year = metadata.get("year", "Unknown")
170
+ stats["years"][year] = stats["years"].get(year, 0) + 1
171
+
172
+ stats["total_content_length"] = total_length
173
+ stats["avg_chunk_length"] = total_length / len(chunks) if chunks else 0
174
+
175
+ return stats
176
+
177
+
178
+ def filter_chunks_by_metadata(
179
+ chunks: List[Dict[str, Any]],
180
+ source_filter: str = None,
181
+ filename_filter: List[str] = None,
182
+ year_filter: List[str] = None
183
+ ) -> List[Dict[str, Any]]:
184
+ """
185
+ Filter chunks by metadata criteria.
186
+
187
+ Args:
188
+ chunks: List of chunk dictionaries
189
+ source_filter: Source to filter by
190
+ filename_filter: List of filenames to filter by
191
+ year_filter: List of years to filter by
192
+
193
+ Returns:
194
+ Filtered list of chunks
195
+ """
196
+ filtered_chunks = chunks
197
+
198
+ if source_filter:
199
+ filtered_chunks = [
200
+ chunk for chunk in filtered_chunks
201
+ if chunk.get("metadata", {}).get("source") == source_filter
202
+ ]
203
+
204
+ if filename_filter:
205
+ filtered_chunks = [
206
+ chunk for chunk in filtered_chunks
207
+ if chunk.get("metadata", {}).get("filename") in filename_filter
208
+ ]
209
+
210
+ if year_filter:
211
+ filtered_chunks = [
212
+ chunk for chunk in filtered_chunks
213
+ if chunk.get("metadata", {}).get("year") in year_filter
214
+ ]
215
+
216
+ return filtered_chunks
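A quick illustration of the metadata helpers above; the source and year filter values are illustrative:

    from src.loader import load_chunks
    from src.reporting.metadata import (
        get_report_metadata,
        get_report_statistics,
        filter_chunks_by_metadata,
    )

    chunks = load_chunks()
    print(get_report_metadata(chunks)["sources"])
    print(get_report_statistics(chunks)["avg_chunk_length"])

    # Narrow the corpus to one source and year before downstream use.
    subset = filter_chunks_by_metadata(chunks, source_filter="Local Government", year_filter=["2024"])
    print(len(subset))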
src/reporting/service.py ADDED
@@ -0,0 +1,144 @@
1
+ """Report service for managing report operations."""
2
+
3
+ from typing import Dict, List, Any, Optional
4
+ from .metadata import get_report_metadata, get_available_sources, get_source_subtypes
5
+
6
+
7
+ class ReportService:
8
+ """Service class for report operations."""
9
+
10
+ def __init__(self, chunks: List[Dict[str, Any]] = None):
11
+ """
12
+ Initialize report service.
13
+
14
+ Args:
15
+ chunks: List of chunk dictionaries
16
+ """
17
+ self.chunks = chunks or []
18
+ self.metadata = get_report_metadata(self.chunks) if self.chunks else {}
19
+
20
+ def get_available_sources(self) -> List[str]:
21
+ """Get available report sources."""
22
+ if self.metadata:
23
+ return self.metadata.get("sources", [])
24
+ return get_available_sources()
25
+
26
+ def get_available_reports(self) -> List[str]:
27
+ """Get available report filenames."""
28
+ return self.metadata.get("filenames", [])
29
+
30
+ def get_source_subtypes(self) -> Dict[str, List[str]]:
31
+ """Get source to subtype mapping."""
32
+ # For now, use the placeholder function
33
+ # In a full implementation, this would be derived from actual data
34
+ return get_source_subtypes()
35
+
36
+ def get_reports_by_source(self, source: str) -> List[str]:
37
+ """
38
+ Get reports filtered by source.
39
+
40
+ Args:
41
+ source: Source category
42
+
43
+ Returns:
44
+ List of report filenames
45
+ """
46
+ if not self.chunks:
47
+ return []
48
+
49
+ reports = set()
50
+ for chunk in self.chunks:
51
+ metadata = chunk.get("metadata", {})
52
+ if metadata.get("source") == source:
53
+ filename = metadata.get("filename")
54
+ if filename:
55
+ reports.add(filename)
56
+
57
+ return sorted(list(reports))
58
+
59
+ def get_years_by_source(self, source: str) -> List[str]:
60
+ """
61
+ Get years available for a specific source.
62
+
63
+ Args:
64
+ source: Source category
65
+
66
+ Returns:
67
+ List of years
68
+ """
69
+ if not self.chunks:
70
+ return []
71
+
72
+ years = set()
73
+ for chunk in self.chunks:
74
+ metadata = chunk.get("metadata", {})
75
+ if metadata.get("source") == source:
76
+ year = metadata.get("year")
77
+ if year:
78
+ years.add(year)
79
+
80
+ return sorted(list(years))
81
+
82
+ def search_reports(self, query: str) -> List[str]:
83
+ """
84
+ Search for reports by name.
85
+
86
+ Args:
87
+ query: Search query
88
+
89
+ Returns:
90
+ List of matching report filenames
91
+ """
92
+ if not self.chunks:
93
+ return []
94
+
95
+ query_lower = query.lower()
96
+ matching_reports = set()
97
+
98
+ for chunk in self.chunks:
99
+ metadata = chunk.get("metadata", {})
100
+ filename = metadata.get("filename", "")
101
+
102
+ if query_lower in filename.lower():
103
+ matching_reports.add(filename)
104
+
105
+ return sorted(list(matching_reports))
106
+
107
+ def get_report_info(self, filename: str) -> Dict[str, Any]:
108
+ """
109
+ Get information about a specific report.
110
+
111
+ Args:
112
+ filename: Report filename
113
+
114
+ Returns:
115
+ Dictionary with report information
116
+ """
117
+ if not self.chunks:
118
+ return {}
119
+
120
+ report_info = {
121
+ "filename": filename,
122
+ "chunk_count": 0,
123
+ "sources": set(),
124
+ "years": set(),
125
+ "total_content_length": 0
126
+ }
127
+
128
+ for chunk in self.chunks:
129
+ metadata = chunk.get("metadata", {})
130
+ if metadata.get("filename") == filename:
131
+ report_info["chunk_count"] += 1
132
+ report_info["total_content_length"] += len(chunk.get("content", ""))
133
+
134
+ if "source" in metadata:
135
+ report_info["sources"].add(metadata["source"])
136
+
137
+ if "year" in metadata:
138
+ report_info["years"].add(metadata["year"])
139
+
140
+ # Convert sets to lists
141
+ report_info["sources"] = list(report_info["sources"])
142
+ report_info["years"] = list(report_info["years"])
143
+
144
+ return report_info
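For reference, the service in use against the loader output; the report name passed to get_report_info is illustrative:

    from src.loader import load_chunks
    from src.reporting.service import ReportService

    service = ReportService(chunks=load_chunks())
    print(service.get_available_sources())
    print(service.get_reports_by_source("Local Government")[:5])
    print(service.get_report_info("Annual Consolidated OAG 2024"))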
src/reporting/snowflake_connector.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ Snowflake Connector for Feedback System
3
+
4
+ This module handles inserting user feedback into Snowflake.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import logging
10
+ from typing import Dict, Any, Optional
11
+ from src.reporting.feedback_schema import UserFeedback
12
+
13
+ # Try to import snowflake connector
14
+ try:
15
+ import snowflake.connector
16
+ SNOWFLAKE_AVAILABLE = True
17
+ except ImportError:
18
+ SNOWFLAKE_AVAILABLE = False
19
+ logging.warning("⚠️ snowflake-connector-python not installed. Install with: pip install snowflake-connector-python")
20
+
21
+ # Configure logging
22
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class SnowflakeFeedbackConnector:
27
+ """Connector for inserting feedback into Snowflake"""
28
+
29
+ def __init__(
30
+ self,
31
+ user: str,
32
+ password: str,
33
+ account: str,
34
+ warehouse: str,
35
+ database: str = "SNOWFLAKE_LEARNING",
36
+ schema: str = "PUBLIC"
37
+ ):
38
+ self.user = user
39
+ self.password = password
40
+ self.account = account
41
+ self.warehouse = warehouse
42
+ self.database = database
43
+ self.schema = schema
44
+ self._connection = None
45
+
46
+ def connect(self):
47
+ """Establish Snowflake connection"""
48
+ if not SNOWFLAKE_AVAILABLE:
49
+ raise ImportError("snowflake-connector-python is not installed. Install with: pip install snowflake-connector-python")
50
+
51
+ logger.info("=" * 80)
52
+ logger.info("🔌 SNOWFLAKE CONNECTION: Attempting to connect...")
53
+ logger.info(f" - Account: {self.account}")
54
+ logger.info(f" - Warehouse: {self.warehouse}")
55
+ logger.info(f" - Database: {self.database}")
56
+ logger.info(f" - Schema: {self.schema}")
57
+ logger.info(f" - User: {self.user}")
58
+
59
+ try:
60
+ self._connection = snowflake.connector.connect(
61
+ user=self.user,
62
+ password=self.password,
63
+ account=self.account,
64
+ warehouse=self.warehouse
65
+ # Don't set database/schema in connection - we'll do it per query
66
+ )
67
+ logger.info("✅ SNOWFLAKE CONNECTION: Successfully connected")
68
+ logger.info("=" * 80)
69
+ print(f"✅ Connected to Snowflake: {self.database}.{self.schema}")
70
+ except Exception as e:
71
+ logger.error(f"❌ SNOWFLAKE CONNECTION FAILED: {e}")
72
+ logger.error("=" * 80)
73
+ print(f"❌ Failed to connect to Snowflake: {e}")
74
+ raise
75
+
76
+ def disconnect(self):
77
+ """Close Snowflake connection"""
78
+ if self._connection:
79
+ self._connection.close()
80
+ print("✅ Disconnected from Snowflake")
81
+
82
+ def insert_feedback(self, feedback: UserFeedback) -> bool:
83
+ """Insert a single feedback record into Snowflake"""
84
+ logger.info("=" * 80)
85
+ logger.info("🔄 SNOWFLAKE INSERT: Starting feedback insertion process")
86
+ logger.info(f"📝 Feedback ID: {feedback.feedback_id}")
87
+
88
+ if not self._connection:
89
+ logger.error("❌ Not connected to Snowflake. Call connect() first.")
90
+ raise RuntimeError("Not connected to Snowflake. Call connect() first.")
91
+
92
+ try:
93
+ logger.info("📊 VALIDATION: Validating feedback data structure...")
94
+
95
+ # Validate feedback object
96
+ validation_errors = []
97
+ if not feedback.feedback_id:
98
+ validation_errors.append("Missing feedback_id")
99
+ if feedback.score is None:
100
+ validation_errors.append("Missing score")
101
+ if feedback.timestamp is None:
102
+ validation_errors.append("Missing timestamp")
103
+
104
+ if validation_errors:
105
+ logger.error(f"❌ VALIDATION FAILED: {validation_errors}")
106
+ return False
107
+ else:
108
+ logger.info("✅ VALIDATION PASSED: All required fields present")
109
+
110
+ logger.info("📋 Data Summary:")
111
+ logger.info(f" - Feedback ID: {feedback.feedback_id}")
112
+ logger.info(f" - Score: {feedback.score}")
113
+ logger.info(f" - Conversation ID: {feedback.conversation_id}")
114
+ logger.info(f" - Has Retrievals: {feedback.has_retrievals}")
115
+ logger.info(f" - Retrieval Count: {feedback.retrieval_count}")
116
+ logger.info(f" - Message Count: {feedback.message_count}")
117
+ logger.info(f" - Timestamp: {feedback.timestamp}")
118
+
119
+ cursor = self._connection.cursor()
120
+ logger.info("✅ SNOWFLAKE CONNECTION: Cursor created")
121
+
122
+ # Set database and schema context
123
+ logger.info(f"🔧 SETTING CONTEXT: Database={self.database}, Schema={self.schema}")
124
+ try:
125
+ cursor.execute(f'USE DATABASE "{self.database}"')
126
+ cursor.execute(f'USE SCHEMA "{self.schema}"')
127
+ cursor.execute("SELECT CURRENT_DATABASE(), CURRENT_SCHEMA()")
128
+ current_db, current_schema = cursor.fetchone()
129
+ logger.info(f"✅ Current context verified: Database={current_db}, Schema={current_schema}")
130
+ except Exception as e:
131
+ logger.error(f"❌ Could not set context: {e}")
132
+ raise
133
+
134
+ # Prepare data
135
+ logger.info("🔧 DATA PREPARATION: Preparing retrieved_data...")
136
+ retrieved_data_raw = feedback.to_dict()['retrieved_data']
137
+
138
+ logger.info(f" - Retrieved data type (raw): {type(retrieved_data_raw).__name__}")
139
+ logger.info(f" - Retrieved data: {repr(retrieved_data_raw)[:200]}")
140
+
141
+ # If retrieved_data is already a string (from UI), parse it
142
+ if isinstance(retrieved_data_raw, str):
143
+ logger.info(" - Parsing string to Python object")
144
+ retrieved_data = json.loads(retrieved_data_raw)
145
+ elif retrieved_data_raw is None:
146
+ retrieved_data = None
147
+ else:
148
+ # It's already a Python object (list/dict)
149
+ logger.info(" - Data is already a Python object")
150
+ retrieved_data = retrieved_data_raw
151
+
152
+ logger.info(f" - Retrieved data size: {len(str(retrieved_data)) if retrieved_data else 0} characters")
153
+ logger.info(f" - Retrieved data type: {type(retrieved_data).__name__}")
154
+
155
+ # Convert to JSON string for TEXT column
156
+ if retrieved_data:
157
+ retrieved_data_for_db = json.dumps(retrieved_data)
158
+ logger.info(f" - Converting to JSON string for TEXT column")
159
+ logger.info(f" - JSON string length: {len(retrieved_data_for_db)}")
160
+ else:
161
+ logger.info(f" - Retrieved data is None, using NULL")
162
+ retrieved_data_for_db = None
163
+
164
+ # Build SQL with retrieved_data as a TEXT column parameter
165
+ sql = f"""INSERT INTO user_feedback (
166
+ feedback_id,
167
+ open_ended_feedback,
168
+ score,
169
+ is_feedback_about_last_retrieval,
170
+ conversation_id,
171
+ timestamp,
172
+ message_count,
173
+ has_retrievals,
174
+ retrieval_count,
175
+ user_query,
176
+ bot_response,
177
+ created_at,
178
+ retrieved_data
179
+ ) VALUES (
180
+ %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
181
+ %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
182
+ %(retrieval_count)s, %(user_query)s, %(bot_response)s, %(created_at)s,
183
+ %(retrieved_data)s
184
+ )"""
185
+
186
+ logger.info("📝 SQL PREPARATION: Building INSERT statement...")
187
+ logger.info(f" - Target table: user_feedback")
188
+ logger.info(f" - Database: {self.database}")
189
+ logger.info(f" - Schema: {self.schema}")
190
+
191
+ # Prepare parameters
192
+ params = {
193
+ 'feedback_id': feedback.feedback_id,
194
+ 'open_ended_feedback': feedback.open_ended_feedback,
195
+ 'score': feedback.score,
196
+ 'is_feedback_about_last_retrieval': feedback.is_feedback_about_last_retrieval,
197
+ 'conversation_id': feedback.conversation_id,
198
+ 'timestamp': int(feedback.timestamp),
199
+ 'message_count': feedback.message_count,
200
+ 'has_retrievals': feedback.has_retrievals,
201
+ 'retrieval_count': feedback.retrieval_count,
202
+ 'user_query': feedback.user_query,
203
+ 'bot_response': feedback.bot_response,
204
+ 'created_at': feedback.created_at,
205
+ 'retrieved_data': retrieved_data_for_db
206
+ }
207
+
208
+ # Execute insert
209
+ logger.info("🚀 SQL EXECUTION: Executing INSERT query...")
210
+ cursor.execute(sql, params)
211
+
212
+ logger.info("✅ SQL EXECUTION: Query executed successfully")
213
+ logger.info(f" - Rows affected: 1")
214
+ logger.info(f" - Status: SUCCESS")
215
+
216
+ cursor.close()
217
+ logger.info("✅ SNOWFLAKE INSERT: Feedback inserted successfully")
218
+ logger.info(f"📝 Inserted feedback: {feedback.feedback_id}")
219
+ logger.info("=" * 80)
220
+ return True
221
+
222
+ except Exception as e:
223
+ # Check if it's a Snowflake error
224
+ if SNOWFLAKE_AVAILABLE and "ProgrammingError" in str(type(e)):
225
+ logger.error(f"❌ SQL EXECUTION ERROR: {e}")
226
+ logger.error(f" - Error code: {getattr(e, 'errno', 'Unknown')}")
227
+ logger.error(f" - SQL state: {getattr(e, 'sqlstate', 'Unknown')}")
228
+ else:
229
+ logger.error(f"❌ SNOWFLAKE INSERT FAILED: {type(e).__name__}")
230
+ logger.error(f" - Error: {e}")
231
+ logger.error("=" * 80)
232
+ return False
233
+
234
+ def __enter__(self):
235
+ """Context manager entry"""
236
+ self.connect()
237
+ return self
238
+
239
+ def __exit__(self, exc_type, exc_val, exc_tb):
240
+ """Context manager exit"""
241
+ self.disconnect()
242
+
243
+
244
+ def get_snowflake_connector_from_env() -> Optional[SnowflakeFeedbackConnector]:
245
+ """Create Snowflake connector from environment variables"""
246
+ user = os.getenv("SNOWFLAKE_USER")
247
+ password = os.getenv("SNOWFLAKE_PASSWORD")
248
+ account = os.getenv("SNOWFLAKE_ACCOUNT")
249
+ warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")
250
+ database = os.getenv("SNOWFLAKE_DATABASE", "SNOWFLAKE_LEARN")
251
+ schema = os.getenv("SNOWFLAKE_SCHEMA", "PUBLIC")
252
+
253
+ if not all([user, password, account, warehouse]):
254
+ print("⚠️ Snowflake credentials not found in environment variables")
255
+ print("Required variables: SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE")
256
+ return None
257
+
258
+ return SnowflakeFeedbackConnector(
259
+ user=user,
260
+ password=password,
261
+ account=account,
262
+ warehouse=warehouse,
263
+ database=database,
264
+ schema=schema
265
+ )
266
+
267
+
268
+ def save_to_snowflake(feedback: UserFeedback) -> bool:
269
+ """Helper function to save feedback to Snowflake"""
270
+ logger.info("=" * 80)
271
+ logger.info("🔵 SNOWFLAKE SAVE: Starting save process")
272
+ logger.info(f"📝 Feedback ID: {feedback.feedback_id}")
273
+
274
+ connector = get_snowflake_connector_from_env()
275
+
276
+ if not connector:
277
+ logger.warning("⚠️ SNOWFLAKE SAVE: Skipping insertion (credentials not configured)")
278
+ logger.warning(" Required variables: SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE")
279
+ logger.info("=" * 80)
280
+ return False
281
+
282
+ try:
283
+ logger.info("📡 SNOWFLAKE SAVE: Establishing connection...")
284
+ connector.connect()
285
+ logger.info("✅ SNOWFLAKE SAVE: Connection established")
286
+
287
+ logger.info("📥 SNOWFLAKE SAVE: Attempting to insert feedback...")
288
+ success = connector.insert_feedback(feedback)
289
+
290
+ logger.info("🔌 SNOWFLAKE SAVE: Disconnecting...")
291
+ connector.disconnect()
292
+
293
+ if success:
294
+ logger.info("✅ SNOWFLAKE SAVE: Successfully saved feedback")
295
+ else:
296
+ logger.error("❌ SNOWFLAKE SAVE: Failed to save feedback")
297
+
298
+ logger.info("=" * 80)
299
+ return success
300
+ except Exception as e:
301
+ logger.error(f"❌ SNOWFLAKE SAVE ERROR: {type(e).__name__}")
302
+ logger.error(f" - Error: {e}")
303
+ logger.info("=" * 80)
304
+ return False
305
+
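For orientation, a minimal usage sketch of the connector above, assuming the SNOWFLAKE_* environment variables are set and that `feedback` is an already-populated UserFeedback instance (variable names here are illustrative, not part of the diff):

# Minimal sketch: persist one feedback record (assumes env vars are configured
# and `feedback` is a UserFeedback instance built elsewhere).
connector = get_snowflake_connector_from_env()
if connector:
    with connector:                        # __enter__/__exit__ handle connect/disconnect
        ok = connector.insert_feedback(feedback)
        print("saved" if ok else "insert failed")

# Or the one-shot helper, which wraps the same connect/insert/disconnect flow:
save_to_snowflake(feedback)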
src/retrieval/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """Document retrieval and filtering utilities."""
2
+
3
+ from .filter import create_filter, FilterBuilder
4
+ from .context import ContextRetriever, get_context
5
+ from .hybrid import HybridRetriever, get_available_search_modes, get_search_mode_description
6
+
7
+ __all__ = [
8
+ "create_filter",
9
+ "FilterBuilder",
10
+ "ContextRetriever",
11
+ "get_context",
12
+ "HybridRetriever",
13
+ "get_available_search_modes",
14
+ "get_search_mode_description"
15
+ ]
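Because of these re-exports, callers elsewhere in the repo can import the retrieval helpers from the package root; a small illustrative example:

# Illustrative import style enabled by this __init__.py
from src.retrieval import create_filter, get_context, HybridRetriever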
src/retrieval/colbert_cache.py ADDED
@@ -0,0 +1,74 @@
1
+ """
2
+ ColBERT embeddings cache for test set documents.
3
+ Provides O(1) lookup for ColBERT embeddings during late interaction.
4
+ """
5
+
6
+ import json
7
+ import numpy as np
8
+ from pathlib import Path
9
+ from typing import Dict, Optional, Any
10
+
11
+
12
+ class ColBERTCache:
13
+ """Cache for ColBERT embeddings of test set documents."""
14
+
15
+ def __init__(self, cache_file: str = "test_set_colbert_cache.json"):
16
+ self.cache_file = Path("outputs/caches") / cache_file
17
+ self.embeddings_cache: Dict[str, np.ndarray] = {}
18
+ self._load_cache()
19
+
20
+ def _load_cache(self):
21
+ """Load embeddings from cache file."""
22
+ if not self.cache_file.exists():
23
+ print(f"⚠️ ColBERT cache not found: {self.cache_file}")
24
+ print("💡 Run 'python precalculate_test_set_colbert.py' to create cache")
25
+ return
26
+
27
+ print(f"📂 Loading ColBERT cache from {self.cache_file}...")
28
+
29
+ try:
30
+ with open(self.cache_file, 'r') as f:
31
+ cache_data = json.load(f)
32
+
33
+ # Reconstruct embeddings from compressed format
34
+ for doc_id, data in cache_data.items():
35
+ embedding_min = data['min']
36
+ embedding_max = data['max']
37
+ quantized_embedding = np.array(data['embedding'], dtype=np.uint8)
38
+
39
+ # Reconstruct original embedding
40
+ reconstructed = (quantized_embedding.astype(np.float32) / 255.0) * (embedding_max - embedding_min) + embedding_min
41
+ self.embeddings_cache[doc_id] = reconstructed.reshape(data['shape'])
42
+
43
+ print(f"✅ Loaded {len(self.embeddings_cache)} ColBERT embeddings from cache")
44
+
45
+ except Exception as e:
46
+ print(f"❌ Error loading ColBERT cache: {e}")
47
+ self.embeddings_cache = {}
48
+
49
+ def get_embedding(self, document_text: str) -> Optional[np.ndarray]:
50
+ """Get ColBERT embedding for a document (O(1) lookup)."""
51
+ return self.embeddings_cache.get(document_text)
52
+
53
+ def has_embedding(self, document_text: str) -> bool:
54
+ """Check if embedding exists for document."""
55
+ return document_text in self.embeddings_cache
56
+
57
+ def get_cache_stats(self) -> Dict[str, Any]:
58
+ """Get cache statistics."""
59
+ return {
60
+ 'total_embeddings': len(self.embeddings_cache),
61
+ 'cache_file': str(self.cache_file),
62
+ 'cache_exists': self.cache_file.exists()
63
+ }
64
+
65
+
66
+ # Global cache instance
67
+ _colbert_cache = None
68
+
69
+ def get_colbert_cache() -> ColBERTCache:
70
+ """Get global ColBERT cache instance."""
71
+ global _colbert_cache
72
+ if _colbert_cache is None:
73
+ _colbert_cache = ColBERTCache()
74
+ return _colbert_cache
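The loader above reverses a min/max uint8 quantization. Below is a rough sketch of the lookup side, plus what a compatible cache entry would look like; the actual precalculate_test_set_colbert.py script is not part of this commit, so the writer shown here is only an assumed inverse of the loader:

import numpy as np

# Lookup side
cache = get_colbert_cache()
doc_text = "Example audit paragraph..."        # illustrative key (full chunk text)
emb = cache.get_embedding(doc_text)            # np.ndarray or None
print(cache.get_cache_stats())

# Assumed writer: quantize an embedding into the format _load_cache() expects
def quantize_entry(embedding: np.ndarray) -> dict:
    lo, hi = float(embedding.min()), float(embedding.max())
    scale = (hi - lo) or 1.0                   # guard against constant embeddings
    q = np.round((embedding - lo) / scale * 255).astype(np.uint8)
    return {"min": lo, "max": hi, "shape": list(embedding.shape),
            "embedding": q.flatten().tolist()}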
src/retrieval/context.py ADDED
@@ -0,0 +1,881 @@
1
+ """Context retrieval with reranking capabilities."""
2
+
3
+ import os
4
+ from typing import List, Optional, Tuple, Dict, Any
5
+ from langchain.schema import Document
6
+ from langchain_community.vectorstores import Qdrant
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from sentence_transformers import CrossEncoder
9
+ import numpy as np
10
+ import torch
11
+ from qdrant_client.http import models as rest
12
+ import traceback
13
+
14
+ from .filter import create_filter
15
+
16
+ class ContextRetriever:
17
+ """
18
+ Context retriever for hybrid search with optional filtering and reranking.
19
+ """
20
+
21
+ def __init__(self, vectorstore: Qdrant, config: dict = None):
22
+ """
23
+ Initialize the context retriever.
24
+
25
+ Args:
26
+ vectorstore: Qdrant vector store instance
27
+ config: Configuration dictionary
28
+ """
29
+ self.vectorstore = vectorstore
30
+ self.config = config or {}
31
+ self.reranker = None
32
+
33
+ # BM25 attributes
34
+ self.bm25_vectorizer = None
35
+ self.bm25_matrix = None
36
+ self.bm25_documents = None
37
+
38
+ # Initialize reranker if available
39
+ # Try to get reranker model from different config paths
40
+ self.reranker_model_name = (
41
+ config.get('retrieval', {}).get('reranker_model') or
42
+ config.get('ranker', {}).get('model') or
43
+ config.get('reranker_model') or
44
+ 'BAAI/bge-reranker-v2-m3'
45
+ )
46
+ self.reranker_type = self._detect_reranker_type(self.reranker_model_name)
47
+
48
+ try:
49
+ if self.reranker_type == 'colbert':
50
+ from colbert.infra import Run, ColBERTConfig
51
+ from colbert.modeling.checkpoint import Checkpoint
52
+ # ColBERT uses late interaction - different implementation needed
53
+ print(f"✅ RERANKER: ColBERT model detected ({self.reranker_model_name})")
54
+ print(f"🔍 INTERACTION TYPE: Late interaction (token-level embeddings)")
55
+
56
+ # Create ColBERT config for CPU mode
57
+ colbert_config = ColBERTConfig(
58
+ doc_maxlen=300,
59
+ query_maxlen=32,
60
+ nbits=2,
61
+ kmeans_niters=4,
62
+ root="./colbert_data"
63
+ )
64
+
65
+ # Load checkpoint (e.g. "colbert-ir/colbertv2.0")
66
+ self.colbert_checkpoint = Checkpoint(self.reranker_model_name, colbert_config=colbert_config)
67
+ self.colbert_model = self.colbert_checkpoint.model
68
+ self.colbert_tokenizer = self.colbert_checkpoint.raw_tokenizer
69
+ self.reranker = self._colbert_rerank # attach wrapper function
70
+ print(f"✅ COLBERT: Model and tokenizer loaded successfully")
71
+
72
+ else:
73
+ # Standard CrossEncoder for BGE and other models
74
+ from sentence_transformers import CrossEncoder
75
+ self.reranker = CrossEncoder(self.reranker_model_name)
76
+ print(f"✅ RERANKER: Initialized {self.reranker_model_name}")
77
+ print(f"🔍 INTERACTION TYPE: Cross-encoder (single relevance score)")
78
+ except Exception as e:
79
+ print(f"⚠️ Reranker initialization failed: {e}")
80
+ self.reranker = None
81
+
82
+ def _detect_reranker_type(self, model_name: str) -> str:
83
+ """
84
+ Detect the type of reranker based on model name.
85
+
86
+ Args:
87
+ model_name: Name of the reranker model
88
+
89
+ Returns:
90
+ 'colbert' for ColBERT models, 'crossencoder' for others
91
+ """
92
+ model_name_lower = model_name.lower()
93
+
94
+ # ColBERT model patterns
95
+ colbert_patterns = [
96
+ 'colbert',
97
+ 'colbert-ir',
98
+ 'colbertv2',
99
+ 'colbert-v2'
100
+ ]
101
+
102
+ for pattern in colbert_patterns:
103
+ if pattern in model_name_lower:
104
+ return 'colbert'
105
+
106
+ # Default to cross-encoder for BGE and other models
107
+ return 'crossencoder'
108
+
109
+ def _similarity_search_with_colbert_embeddings(self, query: str, k: int = 5, **kwargs) -> List[Tuple[Document, float]]:
110
+ """
111
+ Perform similarity search and fetch ColBERT embeddings for documents.
112
+
113
+ Args:
114
+ query: Search query
115
+ k: Number of documents to retrieve
116
+ **kwargs: Additional search parameters (filter, etc.)
117
+
118
+ Returns:
119
+ List of (Document, score) tuples with ColBERT embeddings in metadata
120
+ """
121
+ try:
122
+ print(f"🔍 COLBERT RETRIEVAL: Fetching documents with ColBERT embeddings")
123
+
124
+ # Use the vectorstore's similarity_search_with_score method instead of direct client
125
+ # This ensures proper filter handling
126
+ if 'filter' in kwargs and kwargs['filter']:
127
+ # Use the vectorstore method with filter
128
+ result = self.vectorstore.similarity_search_with_score(
129
+ query,
130
+ k=k,
131
+ filter=kwargs['filter']
132
+ )
133
+ else:
134
+ # Use the vectorstore method without filter
135
+ result = self.vectorstore.similarity_search_with_score(query, k=k)
136
+
137
+ # Convert to the format we need
138
+ if isinstance(result, tuple) and len(result) == 2:
139
+ documents, scores = result
140
+ elif isinstance(result, list):
141
+ documents = []
142
+ scores = []
143
+ for item in result:
144
+ if isinstance(item, tuple) and len(item) == 2:
145
+ doc, score = item
146
+ documents.append(doc)
147
+ scores.append(score)
148
+ else:
149
+ documents.append(item)
150
+ scores.append(0.0)
151
+ else:
152
+ documents = []
153
+ scores = []
154
+
155
+ # Now we need to fetch the ColBERT embeddings for these documents
156
+ # We'll use the Qdrant client directly for this part since we need specific payload fields
157
+ from qdrant_client.http import models as rest
158
+
159
+ collection_name = self.vectorstore.collection_name
160
+
161
+ # Get document IDs from the retrieved documents
162
+ doc_ids = []
163
+ for doc in documents:
164
+ # Extract ID from document metadata or use page_content hash as fallback
165
+ doc_id = doc.metadata.get('id') or doc.metadata.get('_id')
166
+ if not doc_id:
167
+ # Use a hash of the content as ID
168
+ import hashlib
169
+ doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()
170
+ doc_ids.append(doc_id)
171
+
172
+ # Fetch documents with ColBERT embeddings from Qdrant
173
+ search_result = self.vectorstore.client.retrieve(
174
+ collection_name=collection_name,
175
+ ids=doc_ids,
176
+ with_payload=True,
177
+ with_vectors=False
178
+ )
179
+
180
+ # Convert results to Document objects with ColBERT embeddings
181
+ enhanced_documents = []
182
+ enhanced_scores = []
183
+
184
+ # Create a mapping from doc_id to original score
185
+ doc_id_to_score = {}
186
+ for i, doc in enumerate(documents):
187
+ doc_id = doc.metadata.get('id') or doc.metadata.get('_id')
188
+ if not doc_id:
189
+ import hashlib
190
+ doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()
191
+ doc_id_to_score[doc_id] = scores[i]
192
+
193
+ for point in search_result:
194
+ # Extract payload
195
+ payload = point.payload
196
+
197
+ # Get the original score for this document
198
+ doc_id = str(point.id)
199
+ original_score = doc_id_to_score.get(doc_id, 0.0)
200
+
201
+ # Create Document object with ColBERT embeddings
202
+ doc = Document(
203
+ page_content=payload.get('page_content', ''),
204
+ metadata={
205
+ **payload.get('metadata', {}),
206
+ 'colbert_embedding': payload.get('colbert_embedding'),
207
+ 'colbert_model': payload.get('colbert_model'),
208
+ 'colbert_calculated_at': payload.get('colbert_calculated_at')
209
+ }
210
+ )
211
+
212
+ enhanced_documents.append(doc)
213
+ enhanced_scores.append(original_score)
214
+
215
+ print(f"✅ COLBERT RETRIEVAL: Retrieved {len(enhanced_documents)} documents with ColBERT embeddings")
216
+
217
+ return list(zip(enhanced_documents, enhanced_scores))
218
+
219
+ except Exception as e:
220
+ print(f"❌ COLBERT RETRIEVAL ERROR: {e}")
221
+ print(f"❌ Falling back to regular similarity search")
222
+
223
+ # Fallback to regular search - handle filter parameter correctly
224
+ if 'filter' in kwargs and kwargs['filter']:
225
+ return self.vectorstore.similarity_search_with_score(query, k=k, filter=kwargs['filter'])
226
+ else:
227
+ return self.vectorstore.similarity_search_with_score(query, k=k)
228
+
229
+ def retrieve_context(
230
+ self,
231
+ query: str,
232
+ k: int = 5,
233
+ reports: Optional[List[str]] = None,
234
+ sources: Optional[List[str]] = None,
235
+ subtype: Optional[str] = None,
236
+ year: Optional[str] = None,
237
+ district: Optional[List[str]] = None,
238
+ filenames: Optional[List[str]] = None,
239
+ use_reranking: bool = False,
240
+ qdrant_filter: Optional[rest.Filter] = None
241
+ ) -> List[Document]:
242
+ """
243
+ Retrieve context documents using hybrid search with optional filtering and reranking.
244
+
245
+ Args:
246
+ query: User query
247
+ top_k: Number of documents to retrieve
248
+ reports: List of report names to filter by
249
+ sources: List of sources to filter by
250
+ subtype: Document subtype to filter by
251
+ year: Year to filter by
252
+ use_reranking: Whether to apply reranking
253
+ qdrant_filter: Pre-built Qdrant filter to use
254
+
255
+ Returns:
256
+ List of retrieved documents
257
+ """
258
+ try:
259
+ # Determine how many documents to retrieve
260
+ retrieve_k = k  # could be raised (e.g. k * 3) when reranking, to give the reranker more candidates
261
+
262
+ # Build search kwargs
263
+ search_kwargs = {}
264
+
265
+ # Use qdrant_filter if provided (this takes precedence)
266
+ if qdrant_filter:
267
+ search_kwargs = {"filter": qdrant_filter}
268
+ print(f"✅ FILTERS APPLIED: Using inferred Qdrant filter")
269
+ else:
270
+ # Build filter from individual parameters
271
+ filter_obj = create_filter(
272
+ reports=reports,
273
+ sources=sources,
274
+ subtype=subtype,
275
+ year=year,
276
+ district=district,
277
+ filenames=filenames
278
+ )
279
+
280
+ if filter_obj:
281
+ search_kwargs = {"filter": filter_obj}
282
+ print(f"✅ FILTERS APPLIED: Using built filter")
283
+ else:
284
+ search_kwargs = {}
285
+ print(f"⚠️ NO FILTERS APPLIED: All documents will be searched")
286
+
287
+ # Perform vector search
288
+ try:
289
+ # Check if we need ColBERT embeddings for reranking
290
+ if use_reranking and self.reranker_type == 'colbert':
291
+ result = self._similarity_search_with_colbert_embeddings(
292
+ query,
293
+ k=retrieve_k,
294
+ **search_kwargs
295
+ )
296
+ else:
297
+ result = self.vectorstore.similarity_search_with_score(
298
+ query,
299
+ k=retrieve_k,
300
+ **search_kwargs
301
+ )
302
+
303
+ # Handle different return formats
304
+ if isinstance(result, tuple) and len(result) == 2:
305
+ documents, scores = result
306
+ elif isinstance(result, list) and len(result) > 0:
307
+ # Handle case where result is a list of (Document, score) tuples
308
+ documents = []
309
+ scores = []
310
+ for item in result:
311
+ if isinstance(item, tuple) and len(item) == 2:
312
+ doc, score = item
313
+ documents.append(doc)
314
+ scores.append(score)
315
+ else:
316
+ # Handle case where item is just a Document
317
+ documents.append(item)
318
+ scores.append(0.0) # Default score
319
+ else:
320
+ documents = []
321
+ scores = []
322
+
323
+ print(f"✅ RETRIEVAL SUCCESS: Retrieved {len(documents)} documents (requested: {retrieve_k})")
324
+
325
+ # If we got fewer documents than requested, try without filters
326
+ if len(documents) < retrieve_k and search_kwargs.get('filter'):
327
+ print(f"⚠️ RETRIEVAL: Got {len(documents)} docs with filters, trying without filters...")
328
+ try:
329
+ result_no_filter = self.vectorstore.similarity_search_with_score(
330
+ query,
331
+ k=retrieve_k
332
+ )
333
+
334
+ if isinstance(result_no_filter, tuple) and len(result_no_filter) == 2:
335
+ documents_no_filter, scores_no_filter = result_no_filter
336
+ elif isinstance(result_no_filter, list):
337
+ documents_no_filter = []
338
+ scores_no_filter = []
339
+ for item in result_no_filter:
340
+ if isinstance(item, tuple) and len(item) == 2:
341
+ doc, score = item
342
+ documents_no_filter.append(doc)
343
+ scores_no_filter.append(score)
344
+ else:
345
+ documents_no_filter.append(item)
346
+ scores_no_filter.append(0.0)
347
+ else:
348
+ documents_no_filter = []
349
+ scores_no_filter = []
350
+
351
+ if len(documents_no_filter) > len(documents):
352
+ print(f"✅ RETRIEVAL: Got {len(documents_no_filter)} docs without filters")
353
+ documents = documents_no_filter
354
+ scores = scores_no_filter
355
+ except Exception as e:
356
+ print(f"⚠️ RETRIEVAL: Fallback search failed: {e}")
357
+
358
+ except Exception as e:
359
+ print(f"❌ RETRIEVAL ERROR: {str(e)}")
360
+ return []
361
+
362
+ # Apply reranking if enabled
363
+ reranking_applied = False
364
+ if use_reranking and len(documents) > 1:
365
+ print(f"🔄 RERANKING: Applying {self.reranker_model_name} to {len(documents)} documents...")
366
+ try:
367
+ original_docs = documents.copy()
368
+ original_scores = scores.copy()
369
+
370
+ # Apply reranking
371
+ # print(f"🔍 ORIGINAL DOCS: {documents[0]}")
372
+ reranked_docs = self._apply_reranking(query, documents, scores)
373
+ # print(f"🔍 RERANKED DOCS: {reranked_docs[0]}")
374
+ reranking_applied = len(reranked_docs) > 0
375
+
376
+ if reranking_applied:
377
+ print(f"✅ RERANKING APPLIED: {self.reranker_model_name}")
378
+ documents = reranked_docs
379
+ # Update scores to reflect reranking
380
+ # scores = [0.0] * len(documents) # Reranked scores are not directly comparable
381
+ else:
382
+ print(f"⚠️ RERANKING FAILED: Using original order")
383
+ documents = original_docs
384
+ scores = original_scores
385
+ return documents
386
+
387
+ except Exception as e:
388
+ print(f"❌ RERANKING ERROR: {str(e)}")
389
+ print(f"⚠️ RERANKING FAILED: Using original order")
390
+ reranking_applied = False
391
+ elif use_reranking and len(documents) <= 1:
392
+ print(f"ℹ️ RERANKING: Skipped (only {len(documents)} document(s) retrieved)")
393
+ if use_reranking:
394
+ print(f"ℹ️ RERANKING: Skipped (disabled or insufficient documents)")
395
+ # Store original scores in metadata
396
+ for i, (doc, score) in enumerate(zip(documents, scores)):
397
+ doc.metadata['original_score'] = float(score)
398
+ doc.metadata['reranking_applied'] = False
399
+ return documents
400
+ else:
401
+ print(f"ℹ️ RERANKING: Skipped (disabled or insufficient documents)")
402
+
403
+ # Limit to requested number of documents
404
+ documents = documents[:k]
405
+ scores = scores[:k] if scores else [0.0] * len(documents)
406
+
407
+ # Add metadata to documents
408
+ for i, (doc, score) in enumerate(zip(documents, scores)):
409
+ if hasattr(doc, 'metadata'):
410
+ doc.metadata.update({
411
+ 'reranking_applied': reranking_applied,
412
+ 'reranker_model': self.reranker_model_name if reranking_applied else None,
413
+ 'original_rank': i + 1,
414
+ 'final_rank': i + 1,
415
+ 'original_score': float(score) if score is not None else 0.0
416
+ })
417
+
418
+ return documents
419
+
420
+ except Exception as e:
421
+ print(f"❌ CONTEXT RETRIEVAL ERROR: {str(e)}")
422
+ return []
423
+
424
+ def _apply_reranking(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
425
+ """
426
+ Apply reranking to documents using the appropriate reranker.
427
+
428
+ Args:
429
+ query: User query
430
+ documents: List of documents to rerank
431
+ scores: Original scores
432
+
433
+ Returns:
434
+ Reranked list of documents
435
+ """
436
+ if not self.reranker or len(documents) == 0:
437
+ return documents
438
+
439
+ try:
440
+ print(f"🔍 RERANKING METHOD: Starting reranking with {len(documents)} documents")
441
+ print(f"🔍 RERANKING TYPE: {self.reranker_type.upper()}")
442
+
443
+ if self.reranker_type == 'colbert':
444
+ return self._apply_colbert_reranking(query, documents, scores)
445
+ else:
446
+ return self._apply_crossencoder_reranking(query, documents, scores)
447
+
448
+ except Exception as e:
449
+ print(f"❌ RERANKING ERROR: {str(e)}")
450
+ return documents
451
+
452
+ def _apply_crossencoder_reranking(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
453
+ """
454
+ Apply reranking using CrossEncoder (BGE and other models).
455
+
456
+ Args:
457
+ query: User query
458
+ documents: List of documents to rerank
459
+ scores: Original scores
460
+
461
+ Returns:
462
+ Reranked list of documents
463
+ """
464
+ # Prepare pairs for reranking
465
+ pairs = []
466
+ for doc in documents:
467
+ pairs.append([query, doc.page_content])
468
+
469
+ print(f"🔍 CROSS-ENCODER: Prepared {len(pairs)} pairs for reranking")
470
+
471
+ # Get reranking scores using the correct CrossEncoder API
472
+ rerank_scores = self.reranker.predict(pairs)
473
+
474
+ # Handle single score case
475
+ if not isinstance(rerank_scores, (list, np.ndarray)):
476
+ rerank_scores = [rerank_scores]
477
+
478
+ # Ensure we have the right number of scores
479
+ if len(rerank_scores) != len(documents):
480
+ print(f"⚠️ RERANKING WARNING: Expected {len(documents)} scores, got {len(rerank_scores)}")
481
+ return documents
482
+
483
+ print(f"🔍 CROSS-ENCODER: Got {len(rerank_scores)} rerank scores")
484
+ print(f"🔍 CROSS-ENCODER SCORES: {rerank_scores[:5]}...") # Show first 5 scores
485
+
486
+ # Combine documents with their rerank scores
487
+ doc_scores = list(zip(documents, rerank_scores))
488
+
489
+ # Sort by rerank score (descending)
490
+ doc_scores.sort(key=lambda x: x[1], reverse=True)
491
+
492
+ # Extract reranked documents and store scores in metadata
493
+ reranked_docs = []
494
+ for i, (doc, rerank_score) in enumerate(doc_scores):
495
+ # Find original index for original score
496
+ original_idx = documents.index(doc)
497
+ original_score = scores[original_idx] if original_idx < len(scores) else 0.0
498
+
499
+ # Create new document with reranking metadata
500
+ new_doc = Document(
501
+ page_content=doc.page_content,
502
+ metadata={
503
+ **doc.metadata,
504
+ 'reranking_applied': True,
505
+ 'reranker_model': self.reranker_model_name,
506
+ 'reranker_type': self.reranker_type,
507
+ 'original_rank': original_idx + 1,
508
+ 'final_rank': i + 1,
509
+ 'original_score': float(original_score),
510
+ 'reranked_score': float(rerank_score)
511
+ }
512
+ )
513
+ reranked_docs.append(new_doc)
514
+
515
+ print(f"✅ CROSS-ENCODER: Reranked {len(reranked_docs)} documents")
516
+
517
+ return reranked_docs
518
+
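For reference, the cross-encoder path reduces to scoring (query, passage) pairs; a standalone sketch (the model weights are downloaded on first use):

from sentence_transformers import CrossEncoder

reranker = CrossEncoder("BAAI/bge-reranker-v2-m3")
pairs = [
    ["what did the 2022 audit find?", "passage text A"],
    ["what did the 2022 audit find?", "passage text B"],
]
scores = reranker.predict(pairs)   # one relevance score per pair; higher = more relevant
ranked = sorted(zip(pairs, scores), key=lambda p: p[1], reverse=True)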
519
+ def _apply_colbert_reranking(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
520
+ """
521
+ Apply reranking using ColBERT late interaction.
522
+
523
+ Args:
524
+ query: User query
525
+ documents: List of documents to rerank
526
+ scores: Original scores
527
+
528
+ Returns:
529
+ Reranked list of documents
530
+ """
531
+ # Use the actual ColBERT reranking implementation
532
+ return self._colbert_rerank(query, documents, scores)
533
+
534
+ def _colbert_rerank(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
535
+ """
536
+ ColBERT reranking using late interaction with pre-calculated embeddings support.
537
+
538
+ Args:
539
+ query: User query
540
+ documents: List of documents to rerank
541
+ scores: Original scores
542
+
543
+ Returns:
544
+ Reranked list of documents
545
+ """
546
+ try:
547
+ print(f"🔍 COLBERT: Starting late interaction reranking with {len(documents)} documents")
548
+
549
+ # Check if documents have pre-calculated ColBERT embeddings
550
+ pre_calculated_embeddings = []
551
+ documents_without_embeddings = []
552
+ documents_without_indices = []
553
+
554
+ for i, doc in enumerate(documents):
555
+ if (hasattr(doc, 'metadata') and
556
+ 'colbert_embedding' in doc.metadata and
557
+ doc.metadata['colbert_embedding'] is not None):
558
+ # Use pre-calculated embedding
559
+ colbert_embedding = doc.metadata['colbert_embedding']
560
+ if isinstance(colbert_embedding, list):
561
+ colbert_embedding = torch.tensor(colbert_embedding)
562
+ pre_calculated_embeddings.append(colbert_embedding)
563
+ else:
564
+ # Need to calculate embedding
565
+ documents_without_embeddings.append(doc)
566
+ documents_without_indices.append(i)
567
+
568
+ # Calculate query embedding
569
+ query_embeddings = self.colbert_checkpoint.queryFromText([query])
570
+
571
+ # Calculate embeddings for documents without pre-calculated ones
572
+ if documents_without_embeddings:
573
+ print(f"🔄 COLBERT: Calculating embeddings for {len(documents_without_embeddings)} documents without pre-calculated embeddings")
574
+ doc_texts = [doc.page_content for doc in documents_without_embeddings]
575
+ doc_embeddings = self.colbert_checkpoint.docFromText(doc_texts)
576
+
577
+ # Insert calculated embeddings into the right positions
578
+ for i, embedding in enumerate(doc_embeddings):
579
+ idx = documents_without_indices[i]
580
+ pre_calculated_embeddings.insert(idx, embedding)
581
+ else:
582
+ print(f"✅ COLBERT: Using pre-calculated embeddings for all {len(documents)} documents")
583
+
584
+ # Calculate late interaction scores
585
+ # ColBERT uses MaxSim: for each query token, find max similarity with document tokens
586
+ colbert_scores = []
587
+ for i, doc_embedding in enumerate(pre_calculated_embeddings):
588
+ # Calculate similarity matrix between query and document i
589
+ sim_matrix = torch.matmul(query_embeddings[0], doc_embedding.transpose(-1, -2))
590
+
591
+ # MaxSim: for each query token, take max similarity with document
592
+ max_sim_per_query_token = torch.max(sim_matrix, dim=-1)[0]
593
+
594
+ # Sum over query tokens to get final score
595
+ final_score = torch.sum(max_sim_per_query_token).item()
596
+ colbert_scores.append(final_score)
597
+
598
+ # Sort documents by ColBERT scores
599
+ doc_scores = list(zip(documents, colbert_scores))
600
+ doc_scores.sort(key=lambda x: x[1], reverse=True)
601
+
602
+ # Create reranked documents with metadata
603
+ reranked_docs = []
604
+ for i, (doc, colbert_score) in enumerate(doc_scores):
605
+ original_idx = documents.index(doc)
606
+ original_score = scores[original_idx] if original_idx < len(scores) else 0.0
607
+
608
+ new_doc = Document(
609
+ page_content=doc.page_content,
610
+ metadata={
611
+ **doc.metadata,
612
+ 'reranking_applied': True,
613
+ 'reranker_model': self.reranker_model_name,
614
+ 'reranker_type': self.reranker_type,
615
+ 'original_rank': original_idx + 1,
616
+ 'final_rank': i + 1,
617
+ 'original_score': float(original_score),
618
+ 'reranked_score': float(colbert_score),
619
+ 'colbert_score': float(colbert_score),
620
+ 'colbert_embedding_pre_calculated': 'colbert_embedding' in doc.metadata
621
+ }
622
+ )
623
+ reranked_docs.append(new_doc)
624
+
625
+ print(f"✅ COLBERT: Reranked {len(reranked_docs)} documents using late interaction")
626
+ print(f"🔍 COLBERT SCORES: {[f'{score:.4f}' for score in colbert_scores[:5]]}...")
627
+
628
+ return reranked_docs
629
+
630
+ except Exception as e:
631
+ print(f"❌ COLBERT RERANKING ERROR: {str(e)}")
632
+ print(f"❌ COLBERT TRACEBACK: {traceback.format_exc()}")
633
+ # Fallback to original order - return documents as-is
634
+ return documents
635
+
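The MaxSim scoring used above can be shown in isolation; a toy example with random tensors, independent of the ColBERT checkpoint loaded by this class:

import torch

query_tokens = torch.randn(8, 128)     # 8 query tokens, 128-dim each
doc_tokens = torch.randn(40, 128)      # 40 document tokens

sim = query_tokens @ doc_tokens.T                  # (8, 40) token-level similarities
max_per_query_token = sim.max(dim=-1).values       # best-matching doc token per query token
score = max_per_query_token.sum().item()           # document relevance under MaxSim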
636
+ def retrieve_with_scores(self, query: str, vectorstore=None, k: int = 5, reports: List[str] = None,
637
+ sources: List[str] = None, subtype: List[str] = None,
638
+ year: List[str] = None, use_reranking: bool = False,
639
+ qdrant_filter: Optional[rest.Filter] = None) -> Tuple[List[Document], List[float]]:
640
+ """
641
+ Retrieve context documents with scores using hybrid search with optional reranking.
642
+
643
+ Args:
644
+ query: User query
645
+ vectorstore: Optional vectorstore instance (for compatibility)
646
+ k: Number of documents to retrieve
647
+ reports: List of report names to filter by
648
+ sources: List of sources to filter by
649
+ subtype: Document subtype to filter by
650
+ year: List of years to filter by
651
+ use_reranking: Whether to apply reranking
652
+ qdrant_filter: Pre-built Qdrant filter
653
+
654
+ Returns:
655
+ Tuple of (documents, scores)
656
+ """
657
+ try:
658
+ # Use the provided vectorstore if available, otherwise use the instance one
659
+ if vectorstore:
660
+ self.vectorstore = vectorstore
661
+
662
+ # Determine search strategy
663
+ search_strategy = self.config.get('retrieval', {}).get('search_strategy', 'vector_only')
664
+
665
+ if search_strategy == 'vector_only':
666
+ # Vector search only
667
+ print(f"🔄 VECTOR SEARCH: Retrieving {k} documents...")
668
+
669
+ if qdrant_filter:
670
+ print(f"✅ QDRANT FILTER APPLIED: Using inferred Qdrant filter")
671
+ # Pass the pre-built Qdrant filter through to the vector search
672
+ results = self.vectorstore.similarity_search_with_score(
673
+ query,
674
+ k=k,
675
+ filter=qdrant_filter
676
+ )
677
+ else:
678
+ # Build filter from individual parameters
679
+ filter_conditions = self._build_filter_conditions(reports, sources, subtype, year)
680
+ if filter_conditions:
681
+ print(f"✅ FILTER APPLIED: {filter_conditions}")
682
+ results = self.vectorstore.similarity_search_with_score(
683
+ query,
684
+ k=k,
685
+ filter=filter_conditions
686
+ )
687
+ else:
688
+ print(f"ℹ️ NO FILTERS APPLIED: All documents will be searched")
689
+ results = self.vectorstore.similarity_search_with_score(query, k=k)
690
+
691
+ print(f"🔍 SEARCH DEBUG: Raw result type: {type(results)}")
692
+ print(f"🔍 SEARCH DEBUG: Raw result length: {len(results)}")
693
+
694
+ # Handle different result formats
695
+ if results and isinstance(results[0], tuple):
696
+ documents = [doc for doc, score in results]
697
+ scores = [score for doc, score in results]
698
+ print(f"🔍 SEARCH DEBUG: After unpacking - documents: {len(documents)}, scores: {len(scores)}")
699
+ else:
700
+ documents = results
701
+ scores = [0.0] * len(documents)
702
+ print(f"🔍 SEARCH DEBUG: No scores available, using default")
703
+
704
+ print(f"🔧 CONVERTING: Converting {len(documents)} documents")
705
+
706
+ # Convert to Document objects and store original scores
707
+ final_documents = []
708
+ for i, (doc, score) in enumerate(zip(documents, scores)):
709
+ if hasattr(doc, 'page_content'):
710
+ new_doc = Document(
711
+ page_content=doc.page_content,
712
+ metadata=doc.metadata.copy()
713
+ )
714
+ # Store original score in metadata
715
+ new_doc.metadata['original_score'] = float(score) if score is not None else 0.0
716
+ final_documents.append(new_doc)
717
+ else:
718
+ print(f"⚠️ WARNING: Document {i} has no page_content")
719
+
720
+ print(f"✅ RETRIEVAL SUCCESS: Retrieved {len(final_documents)} documents")
721
+
722
+ # Apply reranking if enabled
723
+ if use_reranking and len(final_documents) > 1:
724
+ print(f"🔄 RERANKING: Applying {self.reranker_model} to {len(final_documents)} documents...")
725
+ final_documents = self._apply_reranking(query, final_documents, scores)
726
+ print(f"✅ RERANKING APPLIED: {self.reranker_model}")
727
+ else:
728
+ print(f"ℹ️ RERANKING: Skipped (disabled or no documents)")
729
+
730
+ return final_documents, scores
731
+
732
+ else:
733
+ print(f"❌ UNSUPPORTED STRATEGY: {search_strategy}")
734
+ return [], []
735
+
736
+ except Exception as e:
737
+ print(f"❌ RETRIEVAL ERROR: {e}")
738
+ print(f"❌ RETRIEVAL TRACEBACK: {traceback.format_exc()}")
739
+ return [], []
740
+
741
+ def _build_filter_conditions(self, reports: List[str] = None, sources: List[str] = None,
742
+ subtype: List[str] = None, year: List[str] = None) -> Optional[rest.Filter]:
743
+ """
744
+ Build Qdrant filter conditions from individual parameters.
745
+
746
+ Args:
747
+ reports: List of report names
748
+ sources: List of sources
749
+ subtype: Document subtype
750
+ year: List of years
751
+
752
+ Returns:
753
+ Qdrant filter or None
754
+ """
755
+ conditions = []
756
+
757
+ if reports:
758
+ conditions.append(rest.FieldCondition(
759
+ key="metadata.filename",
760
+ match=rest.MatchAny(any=reports)
761
+ ))
762
+
763
+ if sources:
764
+ conditions.append(rest.FieldCondition(
765
+ key="metadata.source",
766
+ match=rest.MatchAny(any=sources)
767
+ ))
768
+
769
+ if subtype:
770
+ conditions.append(rest.FieldCondition(
771
+ key="metadata.subtype",
772
+ match=rest.MatchAny(any=subtype)
773
+ ))
774
+
775
+ if year:
776
+ conditions.append(rest.FieldCondition(
777
+ key="metadata.year",
778
+ match=rest.MatchAny(any=year)
779
+ ))
780
+
781
+ if conditions:
782
+ return rest.Filter(must=conditions)
783
+
784
+ return None
785
+
786
+ def get_context(
787
+ query: str,
788
+ vectorstore: Qdrant,
789
+ k: int = 5,
790
+ reports: Optional[List[str]] = None,
791
+ sources: Optional[List[str]] = None,
792
+ subtype: Optional[str] = None,
793
+ year: Optional[str] = None,
794
+ use_reranking: bool = False,
795
+ qdrant_filter: Optional[rest.Filter] = None
796
+ ) -> List[Document]:
797
+ """
798
+ Convenience function to get context documents.
799
+
800
+ Args:
801
+ query: User query
802
+ vectorstore: Qdrant vector store instance
803
+ k: Number of documents to retrieve
804
+ reports: Optional list of report names to filter by
805
+ sources: Optional list of source categories to filter by
806
+ subtype: Optional subtype to filter by
807
+ year: Optional year to filter by
808
+ use_reranking: Whether to apply reranking
809
+ qdrant_filter: Optional pre-built Qdrant filter
810
+
811
+ Returns:
812
+ List of retrieved documents
813
+ """
814
+ retriever = ContextRetriever(vectorstore)
815
+ return retriever.retrieve_context(
816
+ query=query,
817
+ k=k,
818
+ reports=reports,
819
+ sources=sources,
820
+ subtype=subtype,
821
+ year=year,
822
+ use_reranking=use_reranking,
823
+ qdrant_filter=qdrant_filter
824
+ )
825
+
826
+
827
+ def format_context_for_llm(documents: List[Document]) -> str:
828
+ """
829
+ Format retrieved documents for LLM input.
830
+
831
+ Args:
832
+ documents: List of Document objects
833
+
834
+ Returns:
835
+ Formatted string for LLM
836
+ """
837
+ if not documents:
838
+ return ""
839
+
840
+ formatted_parts = []
841
+ for i, doc in enumerate(documents, 1):
842
+ content = doc.page_content.strip()
843
+ source = doc.metadata.get('filename', 'Unknown')
844
+
845
+ formatted_parts.append(f"Document {i} (Source: {source}):\n{content}")
846
+
847
+ return "\n\n".join(formatted_parts)
848
+
849
+
850
+ def get_context_metadata(documents: List[Document]) -> Dict[str, Any]:
851
+ """
852
+ Extract metadata summary from retrieved documents.
853
+
854
+ Args:
855
+ documents: List of Document objects
856
+
857
+ Returns:
858
+ Dictionary with metadata summary
859
+ """
860
+ if not documents:
861
+ return {}
862
+
863
+ sources = set()
864
+ years = set()
865
+ doc_types = set()
866
+
867
+ for doc in documents:
868
+ metadata = doc.metadata
869
+ if 'filename' in metadata:
870
+ sources.add(metadata['filename'])
871
+ if 'year' in metadata:
872
+ years.add(metadata['year'])
873
+ if 'source' in metadata:
874
+ doc_types.add(metadata['source'])
875
+
876
+ return {
877
+ "num_documents": len(documents),
878
+ "sources": list(sources),
879
+ "years": list(years),
880
+ "document_types": list(doc_types)
881
+ }
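Putting the module together, a rough end-to-end sketch, assuming a Qdrant vectorstore has been initialized elsewhere in the app (filter values are illustrative):

docs = get_context(
    query="What challenges did PDM implementation face?",
    vectorstore=vectorstore,          # assumed: an initialized langchain Qdrant wrapper
    k=5,
    sources=["Local Government"],
    year=["2022"],
    use_reranking=True,
)
llm_context = format_context_for_llm(docs)
print(get_context_metadata(docs))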
src/retrieval/filter.py ADDED
@@ -0,0 +1,975 @@
1
+ """Document filtering utilities for Qdrant vector store."""
2
+
3
+ from typing import List, Optional, Union, Dict, Tuple, Any
4
+ from qdrant_client.http import models as rest
5
+ import time
6
+
7
+
8
+ class FilterBuilder:
9
+ """Builder class for creating Qdrant filters."""
10
+
11
+ def __init__(self):
12
+ self.conditions = []
13
+
14
+ def add_source_filter(self, source: Union[str, List[str]]) -> 'FilterBuilder':
15
+ """Add source filter condition."""
16
+ if source:
17
+ if isinstance(source, list):
18
+ condition = rest.FieldCondition(
19
+ key="metadata.source",
20
+ match=rest.MatchAny(any=source)
21
+ )
22
+ print(f"🔧 FilterBuilder: Added source filter for {source}")
23
+ else:
24
+ condition = rest.FieldCondition(
25
+ key="metadata.source",
26
+ match=rest.MatchValue(value=source)
27
+ )
28
+ print(f"🔧 FilterBuilder: Added source filter for '{source}'")
29
+ self.conditions.append(condition)
30
+ return self
31
+
32
+ def add_filename_filter(self, filenames: List[str]) -> 'FilterBuilder':
33
+ """Add filename filter condition."""
34
+ if filenames:
35
+ condition = rest.FieldCondition(
36
+ key="metadata.filename",
37
+ match=rest.MatchAny(any=filenames)
38
+ )
39
+ self.conditions.append(condition)
40
+ print(f"🔧 FilterBuilder: Added filename filter for {filenames}")
41
+ return self
42
+
43
+ def add_year_filter(self, years: List[str]) -> 'FilterBuilder':
44
+ """Add year filter condition."""
45
+ if years:
46
+ condition = rest.FieldCondition(
47
+ key="metadata.year",
48
+ match=rest.MatchAny(any=years)
49
+ )
50
+ self.conditions.append(condition)
51
+ print(f"🔧 FilterBuilder: Added year filter for {years}")
52
+ return self
53
+
54
+ def add_district_filter(self, districts: List[str]) -> 'FilterBuilder':
55
+ """Add district filter condition."""
56
+ if districts:
57
+ condition = rest.FieldCondition(
58
+ key="metadata.district",
59
+ match=rest.MatchAny(any=districts)
60
+ )
61
+ self.conditions.append(condition)
62
+ print(f"🔧 FilterBuilder: Added district filter for {districts}")
63
+ return self
64
+
65
+ def add_custom_filter(self, key: str, value: Union[str, List[str]]) -> 'FilterBuilder':
66
+ """Add custom filter condition."""
67
+ if isinstance(value, list):
68
+ condition = rest.FieldCondition(
69
+ key=key,
70
+ match=rest.MatchAny(any=value)
71
+ )
72
+ else:
73
+ condition = rest.FieldCondition(
74
+ key=key,
75
+ match=rest.MatchValue(value=value)
76
+ )
77
+ self.conditions.append(condition)
78
+ return self
79
+
80
+ def build(self) -> rest.Filter:
81
+ """Build the final filter."""
82
+ if not self.conditions:
83
+ return None
84
+
85
+ return rest.Filter(must=self.conditions)
86
+
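A short sketch of the builder in use (the metadata values are illustrative):

qfilter = (
    FilterBuilder()
    .add_source_filter("Local Government")
    .add_year_filter(["2022", "2023"])
    .add_district_filter(["Kampala"])
    .build()                 # rest.Filter with three must-conditions, or None if empty
)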
87
+
88
+ def create_filter(
89
+ reports: List[str] = None,
90
+ sources: Union[str, List[str]] = None,
91
+ subtype: List[str] = None,
92
+ year: List[str] = None,
93
+ district: List[str] = None,
94
+ filenames: List[str] = None
95
+ ) -> rest.Filter:
96
+ """
97
+ Create a search filter for Qdrant (legacy function for compatibility).
98
+
99
+ Args:
100
+ reports: List of specific report filenames
101
+ sources: Source category
102
+ subtype: List of subtypes/filenames
103
+ year: List of years
104
+ district: List of districts
105
+ filenames: List of specific filenames (mutually exclusive with other filters)
106
+
107
+ Returns:
108
+ Qdrant Filter object
109
+
110
+ Note:
111
+ If filenames are provided, ONLY filename filtering is applied (mutually exclusive)
112
+ """
113
+ builder = FilterBuilder()
114
+
115
+ # Check if filename filtering is requested (mutually exclusive)
116
+ # Both filenames and reports serve the same purpose (backward compatibility)
117
+ # Prefer filenames, fallback to reports for legacy support
118
+ target_filenames = filenames if filenames else reports
119
+
120
+ if target_filenames and len(target_filenames) > 0:
121
+ # ONLY apply filename filter, ignore all other filters
122
+ print(f"🔍 FILTER APPLIED: Filenames = {target_filenames} (mutually exclusive mode)")
123
+ builder.add_filename_filter(target_filenames)
124
+ else:
125
+ # Otherwise, filter by source and subtype
126
+ print(f"🔍 FILTER APPLIED: Sources = {sources}, Subtype = {subtype}, Year = {year}, District = {district}")
127
+ if sources:
128
+ print(f"✅ Adding source filter: metadata.source = '{sources}'")
129
+ builder.add_source_filter(sources)
130
+ if subtype:
131
+ print(f"✅ Adding subtype filter: metadata.filename IN {subtype}")
132
+ builder.add_filename_filter(subtype)
133
+ if year:
134
+ print(f"✅ Adding year filter: metadata.year IN {year}")
135
+ builder.add_year_filter(year)
136
+
137
+ if district:
138
+ print(f"✅ Adding district filter: metadata.district IN {district}")
139
+ builder.add_district_filter(district)
140
+
141
+ filter_obj = builder.build()
142
+
143
+ if filter_obj:
144
+ print(f"�� FINAL FILTER: {len(filter_obj.must)} condition(s) applied")
145
+ for i, condition in enumerate(filter_obj.must, 1):
146
+ print(f" Condition {i}: {condition.key} = {condition.match}")
147
+ else:
148
+ print("⚠️ NO FILTERS APPLIED: All documents will be searched")
149
+
150
+ return filter_obj
151
+
152
+
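Two calls that illustrate the mutually exclusive behaviour documented above (the filenames are hypothetical):

# Filename mode: every other argument is ignored
f1 = create_filter(filenames=["OAG_Annual_Report_2022.pdf"], year=["2021"])

# Metadata mode: source/subtype/year/district conditions are ANDed together
f2 = create_filter(sources="Consolidated", year=["2022"])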
153
+ def create_advanced_filter(
154
+ must_conditions: List[dict] = None,
155
+ should_conditions: List[dict] = None,
156
+ must_not_conditions: List[dict] = None
157
+ ) -> rest.Filter:
158
+ """
159
+ Create advanced filter with multiple condition types.
160
+
161
+ Args:
162
+ must_conditions: Conditions that must match
163
+ should_conditions: Conditions that should match (OR logic)
164
+ must_not_conditions: Conditions that must not match
165
+
166
+ Returns:
167
+ Qdrant Filter object
168
+ """
169
+ filter_dict = {}
170
+
171
+ if must_conditions:
172
+ filter_dict["must"] = [
173
+ _dict_to_field_condition(cond) for cond in must_conditions
174
+ ]
175
+
176
+ if should_conditions:
177
+ filter_dict["should"] = [
178
+ _dict_to_field_condition(cond) for cond in should_conditions
179
+ ]
180
+
181
+ if must_not_conditions:
182
+ filter_dict["must_not"] = [
183
+ _dict_to_field_condition(cond) for cond in must_not_conditions
184
+ ]
185
+
186
+ if not filter_dict:
187
+ return None
188
+
189
+ return rest.Filter(**filter_dict)
190
+
191
+
192
+ def _dict_to_field_condition(condition_dict: dict) -> rest.FieldCondition:
193
+ """Convert dictionary to FieldCondition."""
194
+ key = condition_dict["key"]
195
+ value = condition_dict["value"]
196
+
197
+ if isinstance(value, list):
198
+ match = rest.MatchAny(any=value)
199
+ else:
200
+ match = rest.MatchValue(value=value)
201
+
202
+ return rest.FieldCondition(key=key, match=match)
203
+
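An illustrative advanced filter combining OR and NOT conditions, using the dict format expected by _dict_to_field_condition:

adv = create_advanced_filter(
    should_conditions=[
        {"key": "metadata.source", "value": ["Ministry, Department and Agency", "Local Government"]},
    ],
    must_not_conditions=[{"key": "metadata.year", "value": "2019"}],
)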
204
+
205
+ def validate_filter(filter_obj: rest.Filter) -> bool:
206
+ """
207
+ Validate that a filter object is properly constructed.
208
+
209
+ Args:
210
+ filter_obj: Qdrant Filter object
211
+
212
+ Returns:
213
+ True if valid, raises ValueError if invalid
214
+ """
215
+ if filter_obj is None:
216
+ return True
217
+
218
+ if not isinstance(filter_obj, rest.Filter):
219
+ raise ValueError("Filter must be a rest.Filter object")
220
+
221
+ # Check that at least one condition type is present
222
+ has_conditions = any([
223
+ hasattr(filter_obj, 'must') and filter_obj.must,
224
+ hasattr(filter_obj, 'should') and filter_obj.should,
225
+ hasattr(filter_obj, 'must_not') and filter_obj.must_not
226
+ ])
227
+
228
+ if not has_conditions:
229
+ raise ValueError("Filter must have at least one condition")
230
+
231
+ return True
232
+
233
+
234
+ def infer_filters_from_query(
235
+ query: str,
236
+ available_metadata: dict,
237
+ llm_client=None
238
+ ) -> Tuple[rest.Filter, Union[dict, None]]:
239
+ """
240
+ Automatically infer filters from a query using LLM analysis.
241
+
242
+ Args:
243
+ query: User query to analyze
244
+ available_metadata: Available metadata values in the vectorstore
245
+ llm_client: LLM client for analysis (optional)
246
+
247
+ Returns:
248
+ Qdrant Filter object with inferred conditions
249
+ """
250
+ print(f"�� AUTO-INFERRING FILTERS from query: '{query[:50]}...'")
251
+
252
+ # Check if LLM client is available
253
+ if not llm_client:
254
+ print(f"❌ LLM CLIENT MISSING: Cannot use LLM analysis, falling back to rule-based")
255
+ return _infer_filters_rule_based(query, available_metadata), None
256
+
257
+ # Extract available options
258
+ available_sources = available_metadata.get('sources', [])
259
+ available_years = available_metadata.get('years', [])
260
+ available_filenames = available_metadata.get('filenames', [])
261
+
262
+ print(f"📊 Available metadata: sources={len(available_sources)}, years={len(available_years)}, filenames={len(available_filenames)}")
263
+
264
+ # Try LLM analysis first
265
+ print(f" LLM ANALYSIS: Attempting LLM-based filter inference...")
266
+ llm_result = _analyze_query_with_llm(
267
+ query=query,
268
+ available_metadata=available_metadata,
269
+ llm_client=llm_client
270
+ )
271
+
272
+ if llm_result:
273
+ print(f"✅ LLM SUCCESS: LLM successfully inferred filters")
274
+ # Use the _build_qdrant_filter function to properly build the Qdrant filter
275
+ qdrant_filter, filter_summary = _build_qdrant_filter(llm_result)
276
+ if qdrant_filter:
277
+ print(f"✅ QDRANT FILTER: Successfully built Qdrant filter")
278
+ # print(f"✅ INFERRED FILTERS: {qdrant_filter}")
279
+ return qdrant_filter, filter_summary
280
+ else:
281
+ print(f"❌ QDRANT FILTER: Failed to build Qdrant filter, trying rule-based fallback")
282
+ rule_based_result = _infer_filters_rule_based(query, available_metadata)
283
+ # Use the _build_qdrant_filter function to properly build the Qdrant filter
284
+ qdrant_filter, filter_summary = _build_qdrant_filter(rule_based_result)
285
+ if qdrant_filter:
286
+ print(f"✅ RULE-BASED QDRANT FILTER: Successfully built Qdrant filter")
287
+ return qdrant_filter, filter_summary
288
+ else:
289
+ print(f"❌ RULE-BASED QDRANT FILTER: Failed to build Qdrant filter")
290
+ return None, None
291
+ else:
292
+ print(f"⚠️ LLM FAILED: LLM could not infer filters, trying rule-based fallback")
293
+ rule_based_result = _infer_filters_rule_based(query, available_metadata)
294
+ # Use the _build_qdrant_filter function to properly build the Qdrant filter
295
+ qdrant_filter, filter_summary = _build_qdrant_filter(rule_based_result)
296
+ if qdrant_filter:
297
+ print(f"✅ RULE-BASED QDRANT FILTER: Successfully built Qdrant filter")
298
+ return qdrant_filter, filter_summary
299
+ else:
300
+ print(f"❌ RULE-BASED QDRANT FILTER: Failed to build Qdrant filter")
301
+ return None, None
302
+
303
+
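How this entry point is meant to be consumed, sketched under the assumption that a LangChain-style chat model (`llm`) and a ContextRetriever (`retriever`) are already available; metadata values are illustrative:

query = "How were administrative costs managed by districts in 2022?"
available_metadata = {                 # normally derived from the vectorstore payloads
    "sources": ["Local Government", "Consolidated"],
    "years": ["2021", "2022", "2023"],
    "filenames": [],
}
qdrant_filter, filter_summary = infer_filters_from_query(query, available_metadata, llm_client=llm)
docs = retriever.retrieve_context(query, k=5, qdrant_filter=qdrant_filter)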
304
+ def _analyze_query_with_llm(
305
+ query: str,
306
+ available_metadata: Dict[str, List[str]],
307
+ llm_client=None
308
+ ) -> dict:
309
+
310
+
311
+ """
312
+ - Filenames: {available_metadata.get('filenames', [])}
313
+
314
+ 📁 FILENAME FILTERING (Use Sparingly):
315
+ - Only if specific filename explicitly mentioned
316
+ - Prefer source/subtype over filename
317
+ - Be very conservative
318
+
319
+
320
+ "filenames": ["filename1", "filename2"] or [],
321
+ - For filenames: Only use if you have high confidence and can identify specific files
322
+ """
323
+
324
+
325
+ """
326
+ Use LLM to analyze query and infer appropriate filters.
327
+
328
+ Args:
329
+ query: User query to analyze
330
+ available_metadata: Available metadata values in the vectorstore
331
+ llm_client: LLM client for analysis
332
+
333
+ Returns:
334
+ Dictionary with inferred filters or empty dict if failed
335
+ """
336
+ if not llm_client:
337
+ print("❌ LLM CLIENT MISSING: Cannot analyze query without LLM client")
338
+ return {}
339
+
340
+ try:
341
+ print(f" LLM ANALYSIS: Analyzing query with LLM...")
342
+
343
+
344
+ """
345
+ For example: "What is the expected ... in 2024" - this references a future point in time, so retrieving documents for 2023, 2022 and 2021 can be relevant too
347
+ Another example: "What is the GDP increase now compared to 2022" - this is a relative statement referring to past data, so both the year 2022 and the current year (2025) need to be detected/marked
347
+ """
348
+
349
+ # Create prompt for LLM analysis
350
+ prompt = f"""
351
+ You are a filter inference system. Analyze this query and return ONLY a JSON object.
352
+
353
+ Query: "{query}"
354
+
355
+ Available metadata:
356
+ - Sources: {available_metadata.get('sources', [])}
357
+ - Years: {available_metadata.get('years', [])}
358
+
359
+ FILTER INFERENCE GUIDELINES:
360
+
361
+ YEAR FILTERING (Be VERY Conservative):
362
+ ✅ INFER YEARS ONLY IF:
363
+ - Explicit 4-digit years: "2022", "2023", "2021"
364
+ - Clear relative terms: "last year", "this year", "recent", "current year" (for the context, now is 2025)
365
+ - Temporal context: "annual report 2022", "audit for 2023"
366
+ - Give multiple years for complex queries.
367
+
368
+
369
+ ❌ DO NOT INFER YEARS FOR:
370
+ - Vague terms: "implementation", "activities", "costs", "challenges", "issues"
371
+ - General concepts: "PDM", "administrative", "budget", "staff"
372
+ - Process descriptions: "how were", "what challenges", "management of"
373
+
374
+ 🏛️ SOURCE FILTERING (Context-Based):
375
+ - "Ministry, Department and Agency" → Central government, ministries, departments, PS/ST
376
+ - "Local Government" → Districts, municipalities, local authorities, DLG
377
+ - "Consolidated" → Annual consolidated reports, OAG reports
378
+ - "Thematic" → Special studies, thematic reports
379
+
380
+ 📄 SUBTYPE FILTERING (Document Type):
381
+ - "audit" → Audit reports, reviews, examinations
382
+ - "report" → General reports, annual reports
383
+ - "guidance" → Guidelines, directives, circulars
384
+
385
+ CONFIDENCE SCORING:
386
+ - 0.9-1.0: Crystal clear indicators (explicit years, specific sources)
387
+ - 0.7-0.8: Good indicators (relative years, clear context)
388
+ - 0.5-0.6: Moderate indicators (some context clues)
389
+ - 0.0-0.4: Low confidence (vague or unclear)
390
+
391
+ EXAMPLES:
392
+ ✅ "What challenges arose in 2022?" → years: ["2022"], confidence: 1
393
+ ✅ "How were administrative costs managed in our government?" → sources: ["Local Government"], confidence: 0.75
394
+ ✅ "PDM implementation guidelines from last year" → years: ["2024"], confidence: 0.9
395
+ ❌ "What issues arose with budget execution?" → NO FILTERS, confidence: 0.2
396
+ ❌ "How were tools related to administrative costs?" → NO FILTERS, confidence: 0.1
397
+
398
+ RESPONSE FORMAT (JSON only):
399
+ {{
400
+ "years": ["2022", "2023"] or [],
401
+ "sources": ["Ministry, Department and Agency", "Local Government"] or [],
402
+ "subtype": ["audit", "report"] or [],
403
+ "confidence": 0.8,
404
+ "reasoning": "Very brief explanation of filter choices"
405
+ }}
406
+
407
+ Rules:
408
+ - Use OR logic (SHOULD) for multiple values
409
+ - Prefer sources over filenames
410
+ - Only include years if clearly mentioned
411
+ - Return null for unclear fields
412
+ - For sources/subtypes: Include at least 3 candidates unless confidence is high and you can identify exactly one source (MUST)
413
+ - For years: If you want to include, then include at least 2 candidates unless confidence is high and you can identify exactly one year (MUST)
414
+ """
415
+
416
+ print(f"🔄 LLM CALL: Sending prompt to LLM...")
417
+ try:
418
+ # Try different methods to call the LLM
419
+ if hasattr(llm_client, 'invoke'):
420
+ response = llm_client.invoke(prompt)
421
+ elif hasattr(llm_client, 'generate'):
422
+ response = llm_client.generate([{"role": "user", "content": prompt}])
423
+ elif hasattr(llm_client, 'call'):
424
+ response = llm_client.call(prompt)
425
+ elif hasattr(llm_client, 'predict'):
426
+ response = llm_client.predict(prompt)
427
+ else:
428
+ # Try to call it directly
429
+ response = llm_client(prompt)
430
+
431
+ print(f"✅ LLM CALL SUCCESS: Received response from LLM")
432
+
433
+ # Extract content from response
434
+ if hasattr(response, 'content'):
435
+ response_content = response.content
436
+ elif hasattr(response, 'text'):
437
+ response_content = response.text
438
+ elif isinstance(response, str):
439
+ response_content = response
440
+ else:
441
+ response_content = str(response)
442
+
443
+ print(f"🔄 LLM RESPONSE: {response_content[:200]}...")
444
+
445
+ except Exception as e:
446
+ print(f"❌ LLM CALL FAILED: Error calling LLM - {e}")
447
+ return {}
448
+
449
+ # Parse JSON response
450
+ import json
451
+ import re
452
+ try:
453
+ print(f"🔄 JSON PARSING: Attempting to parse LLM response...")
454
+
455
+ # Clean the response to extract JSON from markdown
456
+ response_text = response_content.strip()
457
+
458
+ # Remove markdown formatting if present
459
+ if "```json" in response_text:
460
+ # Extract JSON from markdown code block
461
+ start_marker = "```json"
462
+ end_marker = "```"
463
+ start_idx = response_text.find(start_marker)
464
+ if start_idx != -1:
465
+ start_idx += len(start_marker)
466
+ end_idx = response_text.find(end_marker, start_idx)
467
+ if end_idx != -1:
468
+ response_text = response_text[start_idx:end_idx].strip()
469
+
470
+ # Try to find JSON object in the response
471
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
472
+ if json_match:
473
+ response_text = json_match.group(0)
474
+
475
+ print(f"🔄 JSON PARSING: Cleaned response: {response_text[:200]}...")
476
+
477
+ # Parse JSON
478
+ filters = json.loads(response_text)
479
+ print(f"✅ JSON PARSING SUCCESS: Parsed filters: {filters}")
480
+
481
+ # Validate filters
482
+ if not isinstance(filters, dict):
483
+ print(f"❌ JSON VALIDATION FAILED: Response is not a dictionary")
484
+ return {}
485
+
486
+ # Check if any filters were inferred
487
+ has_filters = any(filters.get(key) for key in ['sources', 'years', 'filenames'])
488
+ if not has_filters:
489
+ print(f"⚠️ QUERY DIFFICULT: LLM could not determine appropriate filters from query")
490
+ return {}
491
+
492
+ # print(f"✅ FILTER INFERENCE SUCCESS: Inferred filters: {filters}")
493
+ return filters
494
+
495
+ except json.JSONDecodeError as e:
496
+ print(f"❌ JSON PARSING FAILED: Invalid JSON format - {e}")
497
+ print(f"❌ JSON PARSING FAILED: Raw response: {response_text[:500]}...")
498
+ return {}
499
+ except Exception as e:
500
+ print(f"❌ JSON PARSING FAILED: Unexpected error - {e}")
501
+ print(f"❌ JSON PARSING FAILED: Raw response: {response_text[:500]}...")
502
+ return {}
503
+
504
+ except Exception as e:
505
+ print(f"❌ LLM CALL FAILED: Error calling LLM - {e}")
506
+ return {}
507
+
508
+
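The JSON-extraction step above has to cope with replies wrapped in markdown code fences. A minimal standalone sketch of that cleanup, using an invented raw reply (the fence marker is built programmatically only so the example stays self-contained):

```python
import json
import re

# Hypothetical raw reply: many chat models wrap their JSON in a fenced "json" block.
fence = "`" * 3  # the three-backtick fence marker
raw_reply = (
    f"{fence}json\n"
    '{"years": ["2022", "2023"], "sources": ["Consolidated"], '
    '"subtype": ["audit"], "confidence": 0.8, '
    '"reasoning": "Explicit years, consolidated-report context"}\n'
    f"{fence}"
)

text = raw_reply.strip()
if f"{fence}json" in text:
    # Keep only the body of the fenced block, mirroring the parser above.
    text = text.split(f"{fence}json", 1)[1].split(fence, 1)[0].strip()

# Fall back to grabbing the first {...} span, as the parser above also does.
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
    text = match.group(0)

filters = json.loads(text)
print(filters["years"], filters["confidence"])  # ['2022', '2023'] 0.8
```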
509
+ def _infer_filters_rule_based(
510
+ query: str,
511
+ available_metadata: dict
512
+ ) -> dict:
513
+ """
514
+ Rule-based fallback for filter inference with improved logic.
515
+
516
+ Args:
517
+ query: User query
518
+ available_metadata: Available metadata values in the vectorstore
519
+
520
+ Returns:
521
+ Dictionary of inferred filters
522
+ """
523
+ print(f" RULE-BASED ANALYSIS: Starting rule-based inference for query: '{query[:50]}...'")
524
+
525
+ inferred = {}
526
+ query_lower = query.lower()
527
+
528
+ # SEMANTIC SOURCE INFERENCE - Use semantic understanding
529
+ source_matches = []
530
+
531
+ # Define semantic mappings for better source inference
532
+ source_keywords = {
533
+ 'consolidated': ['consolidated', 'annual', 'oag', 'auditor general', 'government', 'financial statements', 'budget', 'expenditure', 'revenue'],
534
+ 'military': ['military', 'defence', 'defense', 'army', 'navy', 'air force', 'security', 'defense ministry'],
535
+ 'departmental': ['department', 'ministry', 'agency', 'authority', 'commission', 'board', 'directorate'],
536
+ 'thematic': ['thematic', 'sector', 'program', 'project', 'initiative', 'development', 'infrastructure']
537
+ }
538
+
539
+ for source in available_metadata.get('sources', []):
540
+ source_lower = source.lower()
541
+
542
+ # Direct keyword match
543
+ if source_lower in query_lower:
544
+ source_matches.append(source)
545
+ print(f"✅ DIRECT MATCH: Found direct keyword match for '{source}'")
546
+ else:
547
+ # Semantic keyword matching
548
+ if source_lower in source_keywords:
549
+ keywords = source_keywords[source_lower]
550
+ matches = sum(1 for keyword in keywords if keyword in query_lower)
551
+ if matches >= 2: # Require at least 2 keyword matches for semantic inference
552
+ source_matches.append(source)
553
+ print(f"✅ SEMANTIC MATCH: Found {matches} semantic keywords for '{source}': {[k for k in keywords if k in query_lower]}")
554
+
555
+ if source_matches:
556
+ # Use SHOULD (OR logic) for multiple sources
557
+ inferred['sources_should'] = source_matches
558
+ print(f"✅ SOURCE INFERENCE: Found {len(source_matches)} sources with OR logic: {source_matches}")
559
+ else:
560
+ print("❌ SOURCE INFERENCE: No source keywords found in query")
561
+
562
+ # Infer year filters - use SHOULD (OR logic) for multiple years
563
+ import re
564
+ year_matches = []
565
+ for year in available_metadata.get('years', []):
566
+ if year in query or f"'{year}" in query:
567
+ year_matches.append(year)
568
+
569
+ if year_matches:
570
+ # Use SHOULD (OR logic) for multiple years
571
+ inferred['years_should'] = year_matches
572
+ print(f"✅ YEAR INFERENCE: Found {len(year_matches)} years with OR logic: {year_matches}")
573
+ else:
574
+ print("❌ YEAR INFERENCE: No year references found in query")
575
+
576
+ # Only infer filename filters if no year filter was found (to avoid conflicts)
577
+ if not year_matches:
578
+ filename_matches = []
579
+ for filename in available_metadata.get('filenames', []):
580
+ # Only match if multiple words from filename appear in query
581
+ filename_words = filename.lower().split()
582
+ matches = sum(1 for word in filename_words if word in query_lower)
583
+ if matches >= 2: # High confidence threshold
584
+ filename_matches.append(filename)
585
+
586
+ if filename_matches:
587
+ # Use SHOULD (OR logic) for multiple filenames
588
+ inferred['filenames_should'] = filename_matches
589
+ print(f"✅ FILENAME INFERENCE: Found {len(filename_matches)} filenames with OR logic: {filename_matches}")
590
+ else:
591
+ print("❌ FILENAME INFERENCE: No high-confidence filename matches found")
592
+ else:
593
+ print("ℹ️ FILENAME INFERENCE: Skipped (year filter already applied to avoid conflicts)")
594
+
595
+ print(f" RULE-BASED RESULT: {inferred}")
596
+ return inferred
597
+
598
+
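A hedged usage sketch of the rule-based fallback above; the corpus metadata and the query are invented for illustration and assume the function is in scope:

```python
# Invented corpus metadata, for illustration only.
available_metadata = {
    "sources": ["Consolidated", "Local Government", "Thematic"],
    "years": ["2021", "2022", "2023"],
    "filenames": ["consolidated_audit_2022.pdf"],
}

query = "What did the consolidated annual audit of government expenditure find in 2022?"
filters = _infer_filters_rule_based(query, available_metadata)
print(filters)
# Expected shape, keyword hits permitting:
# {'sources_should': ['Consolidated'], 'years_should': ['2022']}
```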
599
+ def _validate_inferred_filters(inferred_filters: dict) -> dict:
600
+ """
601
+ Validate and normalize inferred filters to ensure they're in the expected format.
602
+
603
+ Args:
604
+ inferred_filters: Raw inferred filters dictionary
605
+
606
+ Returns:
607
+ Validated and normalized filters dictionary
608
+ """
609
+ if not isinstance(inferred_filters, dict):
610
+ print(f"⚠️ FILTER VALIDATION: Inferred filters is not a dict: {type(inferred_filters)}")
611
+ return {}
612
+
613
+ validated = {}
614
+
615
+ # Normalize field names and validate values
616
+ for field_name in ['sources', 'sources_should', 'years', 'years_should', 'filenames', 'filenames_should']:
617
+ if field_name in inferred_filters and inferred_filters[field_name]:
618
+ value = inferred_filters[field_name]
619
+ if isinstance(value, list) and len(value) > 0:
620
+ # Remove any None or empty string values
621
+ clean_value = [v for v in value if v is not None and str(v).strip()]
622
+ if clean_value:
623
+ validated[field_name] = clean_value
624
+ print(f"✅ FILTER VALIDATION: {field_name} = {clean_value}")
625
+ elif isinstance(value, str) and value.strip():
626
+ validated[field_name] = [value.strip()]
627
+ print(f"✅ FILTER VALIDATION: {field_name} = [{value.strip()}]")
628
+
629
+ return validated
630
+
631
+
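A quick illustration of the normalization performed above: bare strings become single-item lists, empty strings and None entries are stripped, and fields left empty are dropped (values here are made up):

```python
raw = {
    "sources": "Consolidated",           # bare string -> single-item list
    "years_should": ["2022", "", None],  # empties and None are stripped
    "filenames": [],                     # empty field -> dropped entirely
}
print(_validate_inferred_filters(raw))
# -> {'sources': ['Consolidated'], 'years_should': ['2022']}
```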
632
+ def _build_qdrant_filter(inferred_filters: dict) -> tuple:
633
+ """
634
+ Build Qdrant filter from inferred filters.
635
+
636
+ Args:
637
+ inferred_filters: Dictionary with inferred filter values
638
+
639
+ Returns:
640
+ Tuple of (Qdrant Filter object or None, dict summarizing the applied filters)
641
+ """
642
+ try:
643
+ from qdrant_client.http import models as rest
644
+
645
+ # Validate and normalize the inferred filters first
646
+ validated_filters = _validate_inferred_filters(inferred_filters)
647
+ if not validated_filters:
648
+ print(f"⚠️ NO VALID FILTERS: All filters were invalid or empty")
649
+ return None, {}
650
+
651
+ conditions = []
652
+ filter_summary = {}
653
+
654
+ # Handle sources (use OR logic for multiple values)
655
+ # Support both 'sources' and 'sources_should' field names
656
+ source_values = None
657
+ if 'sources' in validated_filters and validated_filters['sources']:
658
+ source_values = validated_filters['sources']
659
+ elif 'sources_should' in validated_filters and validated_filters['sources_should']:
660
+ source_values = validated_filters['sources_should']
661
+
662
+ if source_values and isinstance(source_values, list) and len(source_values) > 0:
663
+ if len(source_values) == 1:
664
+ conditions.append(rest.FieldCondition(
665
+ key="metadata.source",
666
+ match=rest.MatchValue(value=source_values[0])
667
+ ))
668
+ else:
669
+ # Use MatchAny instead of Filter(should=...) to avoid QueryPoints error
670
+ conditions.append(rest.FieldCondition(
671
+ key="metadata.source",
672
+ match=rest.MatchAny(any=source_values)
673
+ ))
674
+ filter_summary['sources'] = f"SHOULD: {source_values}"
675
+
676
+ # Handle years (use OR logic for multiple values)
677
+ # Support both 'years' and 'years_should' field names
678
+ year_values = None
679
+ if 'years' in validated_filters and validated_filters['years']:
680
+ year_values = validated_filters['years']
681
+ elif 'years_should' in validated_filters and validated_filters['years_should']:
682
+ year_values = validated_filters['years_should']
683
+
684
+ if year_values and isinstance(year_values, list) and len(year_values) > 0:
685
+ if len(year_values) == 1:
686
+ conditions.append(rest.FieldCondition(
687
+ key="metadata.year",
688
+ match=rest.MatchValue(value=year_values[0])
689
+ ))
690
+ else:
691
+ # Use MatchAny instead of Filter(should=...) to avoid QueryPoints error
692
+ conditions.append(rest.FieldCondition(
693
+ key="metadata.year",
694
+ match=rest.MatchAny(any=year_values)
695
+ ))
696
+ filter_summary['years'] = f"SHOULD: {year_values}"
697
+
698
+ # Handle filenames (use OR logic for multiple values)
699
+ # Support both 'filenames' and 'filenames_should' field names
700
+ filename_values = None
701
+ if 'filenames' in validated_filters and validated_filters['filenames']:
702
+ filename_values = validated_filters['filenames']
703
+ elif 'filenames_should' in validated_filters and validated_filters['filenames_should']:
704
+ filename_values = validated_filters['filenames_should']
705
+
706
+ if filename_values and isinstance(filename_values, list) and len(filename_values) > 0:
707
+ if len(filename_values) == 1:
708
+ conditions.append(rest.FieldCondition(
709
+ key="metadata.filename",
710
+ match=rest.MatchValue(value=filename_values[0])
711
+ ))
712
+ else:
713
+ # Use MatchAny instead of Filter(should=...) to avoid QueryPoints error
714
+ conditions.append(rest.FieldCondition(
715
+ key="metadata.filename",
716
+ match=rest.MatchAny(any=filename_values)
717
+ ))
718
+ filter_summary['filenames'] = f"SHOULD: {filename_values}"
719
+
720
+ # Build final filter
721
+ if conditions:
722
+ # Always wrap conditions in a Filter object, even for single conditions
723
+ result_filter = rest.Filter(must=conditions)
724
+
725
+ # Print clean filter summary
726
+ print(f"✅ APPLIED FILTERS: {filter_summary}")
727
+ return result_filter, filter_summary
728
+ else:
729
+ print(f"⚠️ NO FILTERS APPLIED: All documents will be searched")
730
+ return None, {}
731
+
732
+ except Exception as e:
733
+ print(f"❌ FILTER BUILD ERROR: {str(e)}")
734
+ print(f"🔍 DEBUG: Original inferred filters keys: {list(inferred_filters.keys()) if isinstance(inferred_filters, dict) else 'Not a dict'}")
735
+ print(f"🔍 DEBUG: Original inferred filters content: {inferred_filters}")
736
+ print(f"🔍 DEBUG: Validated filters keys: {list(validated_filters.keys()) if isinstance(validated_filters, dict) else 'Not a dict'}")
737
+ print(f"🔍 DEBUG: Validated filters content: {validated_filters}")
738
+ # Return a safe fallback - no filter (search all documents)
739
+ return None, {}
740
+
741
+
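A minimal sketch of how the builder above is meant to be used, assuming qdrant-client is installed and the helpers are in scope; the inferred values are made up:

```python
inferred = {"sources": ["Consolidated"], "years": ["2022", "2023"]}
qdrant_filter, summary = _build_qdrant_filter(inferred)
print(summary)
# e.g. {'sources': "SHOULD: ['Consolidated']", 'years': "SHOULD: ['2022', '2023']"}

# The resulting filter is then passed straight to the vector search, e.g.:
# docs = vectorstore.similarity_search(query, k=10, filter=qdrant_filter)
```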
742
+ class MetadataCache:
743
+ """Cache for vectorstore metadata to avoid repeated queries."""
744
+
745
+ def __init__(self):
746
+ self._cache = None
747
+ self._last_updated = None
748
+ self._cache_ttl = 3600 # 1 hour TTL
749
+
750
+ def get_metadata(self, vectorstore) -> dict:
751
+ """
752
+ Get metadata from cache or load it if not available/expired.
753
+
754
+ Args:
755
+ vectorstore: QdrantVectorStore instance
756
+
757
+ Returns:
758
+ Dictionary of available metadata values
759
+ """
760
+ import time
761
+
762
+ # Check if cache is valid
763
+ if (self._cache is not None and
764
+ self._last_updated is not None and
765
+ time.time() - self._last_updated < self._cache_ttl):
766
+ print(f"✅ METADATA CACHE: Using cached metadata")
767
+ return self._cache
768
+
769
+ try:
770
+ print(f"🔄 METADATA CACHE: Loading metadata from vectorstore...")
771
+
772
+ # Get collection info
773
+ try:
774
+ collection_info = vectorstore._client.get_collection(vectorstore.collection_name)
775
+ print(f"✅ Collection info retrieved: {getattr(collection_info, 'name', 'unknown')}")
776
+ except Exception as e:
777
+ print(f"⚠️ Could not get collection info: {e}")
778
+
779
+ # Get ALL documents to extract complete metadata
780
+ print(f"📄 Scanning entire corpus for complete metadata extraction...")
781
+
782
+ # Get collection info to determine total size
783
+ try:
784
+ collection_info = vectorstore._client.get_collection(vectorstore.collection_name)
785
+ total_points = getattr(collection_info, 'points_count', 0)
786
+ print(f"📊 Total documents in corpus: {total_points}")
787
+ except Exception as e:
788
+ print(f"⚠️ Could not get collection size: {e}")
789
+ total_points = 0
790
+
791
+ # Extract unique metadata values from ALL documents
792
+ sources = set()
793
+ years = set()
794
+ filenames = set()
795
+
796
+ # Try to use scroll to get all documents in batches
797
+ batch_size = 1000 # Process in batches to avoid memory issues
798
+ offset = None
799
+ processed_count = 0
800
+ scroll_success = False
801
+
802
+ try:
803
+ while True:
804
+ # Scroll through all documents
805
+ scroll_result = vectorstore._client.scroll(
806
+ collection_name=vectorstore.collection_name,
807
+ limit=batch_size,
808
+ offset=offset,
809
+ with_payload=True,
810
+ with_vectors=False # We only need metadata
811
+ )
812
+
813
+ points = scroll_result[0] # Get the points
814
+ if not points:
815
+ break # No more documents
816
+
817
+ # Process each document
818
+ for i, point in enumerate(points):
819
+ if hasattr(point, 'payload') and point.payload:
820
+ payload = point.payload
821
+
822
+ # Debug: Log structure of first few documents
823
+ if processed_count + i < 2: # Only log first 2 documents
824
+ print(f"🔍 DEBUG Document {processed_count + i + 1} payload structure:")
825
+ print(f" Payload keys: {list(payload.keys()) if isinstance(payload, dict) else 'Not a dict'}")
826
+ if isinstance(payload, dict) and 'metadata' in payload:
827
+ print(f" Metadata keys: {list(payload['metadata'].keys()) if isinstance(payload['metadata'], dict) else 'Not a dict'}")
828
+ elif isinstance(payload, dict):
829
+ print(f" Top-level keys: {list(payload.keys())}")
830
+ print(f" Payload type: {type(payload)}")
831
+ print(f" Payload sample: {str(payload)[:200]}...")
832
+ print()
833
+
834
+ # Try different metadata structures
835
+ found_metadata = False
836
+
837
+ # Structure 1: payload['metadata']['source']
838
+ if isinstance(payload, dict) and 'metadata' in payload:
839
+ metadata = payload['metadata']
840
+ if isinstance(metadata, dict):
841
+ if 'source' in metadata:
842
+ sources.add(metadata['source'])
843
+ found_metadata = True
844
+ if 'year' in metadata:
845
+ years.add(metadata['year'])
846
+ found_metadata = True
847
+ if 'filename' in metadata:
848
+ filenames.add(metadata['filename'])
849
+ found_metadata = True
850
+
851
+ # Structure 2: payload['source'] (direct)
852
+ if isinstance(payload, dict):
853
+ if 'source' in payload:
854
+ sources.add(payload['source'])
855
+ found_metadata = True
856
+ if 'year' in payload:
857
+ years.add(payload['year'])
858
+ found_metadata = True
859
+ if 'filename' in payload:
860
+ filenames.add(payload['filename'])
861
+ found_metadata = True
862
+
863
+ # Structure 3: Check for nested structures
864
+ if not found_metadata and isinstance(payload, dict):
865
+ # Look for any nested dict that might contain metadata
866
+ for key, value in payload.items():
867
+ if isinstance(value, dict):
868
+ if 'source' in value:
869
+ sources.add(value['source'])
870
+ found_metadata = True
871
+ if 'year' in value:
872
+ years.add(value['year'])
873
+ found_metadata = True
874
+ if 'filename' in value:
875
+ filenames.add(value['filename'])
876
+ found_metadata = True
877
+
878
+ processed_count += len(points)
879
+ progress_pct = (processed_count / total_points * 100) if total_points > 0 else 0
880
+ print(f"📄 Processed {processed_count}/{total_points} documents ({progress_pct:.1f}%)... (sources: {len(sources)}, years: {len(years)}, filenames: {len(filenames)})")
881
+
882
+ # Update offset for next batch
883
+ offset = scroll_result[1] # Next offset
884
+ if offset is None:
885
+ break # No more documents
886
+
887
+ scroll_success = True
888
+ print(f"✅ Scroll method successful - processed {processed_count} documents")
889
+
890
+ except Exception as e:
891
+ print(f"❌ Scroll method failed: {e}")
892
+ print(f"🔄 Falling back to similarity search method...")
893
+
894
+ # Fallback: Use similarity search with multiple queries to get more coverage
895
+ fallback_queries = [
896
+ "", # Empty query
897
+ "audit", "report", "government", "ministry", "department",
898
+ "local", "consolidated", "annual", "financial", "budget",
899
+ "2020", "2021", "2022", "2023", "2024" # Year queries
900
+ ]
901
+
902
+ processed_count = 0
903
+ for query in fallback_queries:
904
+ try:
905
+ # Get documents for this query
906
+ docs = vectorstore.similarity_search(query, k=1000) # Get more per query
907
+
908
+ for j, doc in enumerate(docs):
909
+ if hasattr(doc, 'metadata') and doc.metadata:
910
+ # Debug: Log structure of first few documents in fallback
911
+ if processed_count + j < 3: # Only log first 3 documents per query
912
+ print(f"🔍 DEBUG Fallback Document {processed_count + j + 1} (query: '{query}') metadata structure:")
913
+ print(f" Metadata keys: {list(doc.metadata.keys()) if isinstance(doc.metadata, dict) else 'Not a dict'}")
914
+ print(f" Metadata type: {type(doc.metadata)}")
915
+ print(f" Metadata sample: {str(doc.metadata)[:200]}...")
916
+ print()
917
+
918
+ if 'source' in doc.metadata:
919
+ sources.add(doc.metadata['source'])
920
+ if 'year' in doc.metadata:
921
+ years.add(doc.metadata['year'])
922
+ if 'filename' in doc.metadata:
923
+ filenames.add(doc.metadata['filename'])
924
+
925
+ processed_count += len(docs)
926
+ print(f"📄 Fallback query '{query}': {len(docs)} docs (total: {processed_count}, sources: {len(sources)}, years: {len(years)}, filenames: {len(filenames)})")
927
+
928
+ except Exception as query_error:
929
+ print(f"⚠️ Fallback query '{query}' failed: {query_error}")
930
+ continue
931
+
932
+ print(f"✅ Fallback method completed - processed {processed_count} documents")
933
+
934
+ print(f"✅ Completed scanning {processed_count} documents from entire corpus")
935
+
936
+ # Convert to sorted lists
937
+ metadata = {
938
+ 'sources': sorted(list(sources)),
939
+ 'years': sorted(list(years)),
940
+ 'filenames': sorted(list(filenames))
941
+ }
942
+
943
+ # Cache the results
944
+ self._cache = metadata
945
+ self._last_updated = time.time()
946
+
947
+ print(f"✅ Complete metadata extracted from entire corpus: {len(sources)} sources, {len(years)} years, {len(filenames)} files")
948
+
949
+ # Debug: Show what was actually found
950
+ if sources:
951
+ print(f"📁 Sources found: {sorted(list(sources))}")
952
+ else:
953
+ print(f"❌ No sources found - check metadata structure")
954
+
955
+ if years:
956
+ print(f"📅 Years found: {sorted(list(years))}")
957
+ else:
958
+ print(f"❌ No years found - check metadata structure")
959
+
960
+ if filenames:
961
+ print(f"📄 Filenames found: {sorted(list(filenames))[:10]}{'...' if len(filenames) > 10 else ''}")
962
+ else:
963
+ print(f"❌ No filenames found - check metadata structure")
964
+ return metadata
965
+
966
+ except Exception as e:
967
+ print(f"❌ Error extracting metadata: {e}")
968
+ return {'sources': [], 'years': [], 'filenames': []}
969
+
970
+ # Global metadata cache
971
+ _metadata_cache = MetadataCache()
972
+
973
+ def get_available_metadata(vectorstore) -> dict:
974
+ """Get available metadata values from the vectorstore efficiently."""
975
+ return _metadata_cache.get_metadata(vectorstore)
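The cache above walks the whole collection once per TTL window. For reference, a condensed standalone sketch of the same scroll pattern with qdrant-client; the URL and collection name are assumptions, and the payload handling mirrors the nested/flat structures handled above:

```python
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")  # assumption: local Qdrant instance
collection = "audit_reports"                        # assumption: collection name

sources, years, filenames = set(), set(), set()
offset = None
while True:
    points, offset = client.scroll(
        collection_name=collection,
        limit=1000,
        offset=offset,
        with_payload=True,
        with_vectors=False,  # metadata only, embeddings are not needed
    )
    if not points:
        break
    for point in points:
        payload = point.payload or {}
        meta = payload.get("metadata", payload)  # nested or flat payload layouts
        for key, bucket in (("source", sources), ("year", years), ("filename", filenames)):
            if key in meta:
                bucket.add(meta[key])
    if offset is None:
        break

print(sorted(sources), sorted(years), len(filenames))
```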
src/retrieval/hybrid.py ADDED
@@ -0,0 +1,479 @@
1
+ """Hybrid search implementation combining vector and sparse retrieval."""
2
+
3
+ import json
4
+ import numpy as np
5
+ from typing import List, Dict, Any, Tuple
6
+ from pathlib import Path
7
+ from langchain.docstore.document import Document
8
+ from langchain_qdrant import QdrantVectorStore
9
+ from langchain_community.retrievers import BM25Retriever
10
+ from .filter import create_filter
11
+ import pickle
12
+ import os
13
+
14
+
15
+ class HybridRetriever:
16
+ """
17
+ Hybrid retrieval system combining vector search (dense) and BM25 (sparse) search.
18
+ Supports configurable search modes: vector_only, sparse_only, or hybrid.
19
+ """
20
+
21
+ def __init__(self, config: Dict[str, Any]):
22
+ """
23
+ Initialize hybrid retriever.
24
+
25
+ Args:
26
+ config: Configuration dictionary with hybrid search settings
27
+ """
28
+ self.config = config
29
+ self.bm25_retriever = None
30
+ self.documents = []
31
+ self._bm25_cache_file = None
32
+
33
+ def _get_bm25_cache_path(self) -> str:
34
+ """Get path for BM25 cache file."""
35
+ cache_dir = Path("cache/bm25")
36
+ cache_dir.mkdir(parents=True, exist_ok=True)
37
+ return str(cache_dir / "bm25_retriever.pkl")
38
+
39
+ def initialize_bm25(self, documents: List[Document], force_rebuild: bool = False) -> None:
40
+ """
41
+ Initialize BM25 retriever with documents.
42
+
43
+ Args:
44
+ documents: List of Document objects to index
45
+ force_rebuild: Whether to force rebuilding the BM25 index
46
+ """
47
+ self.documents = documents
48
+ self._bm25_cache_file = self._get_bm25_cache_path()
49
+
50
+ # Try to load cached BM25 retriever
51
+ if not force_rebuild and os.path.exists(self._bm25_cache_file):
52
+ try:
53
+ print("Loading cached BM25 retriever...")
54
+ with open(self._bm25_cache_file, 'rb') as f:
55
+ self.bm25_retriever = pickle.load(f)
56
+ print(f"✅ Loaded cached BM25 retriever with {len(self.documents)} documents")
57
+ return
58
+ except Exception as e:
59
+ print(f"⚠️ Failed to load cached BM25 retriever: {e}")
60
+ print("Building new BM25 index...")
61
+
62
+ # Build new BM25 retriever
63
+ print("Building BM25 index...")
64
+ try:
65
+ # Use langchain's BM25Retriever
66
+ self.bm25_retriever = BM25Retriever.from_documents(documents)
67
+
68
+ # Configure BM25 parameters
69
+ bm25_config = self.config.get("bm25", {})
70
+ k = bm25_config.get("top_k", 20)
71
+ self.bm25_retriever.k = k
72
+
73
+ # Cache the BM25 retriever
74
+ with open(self._bm25_cache_file, 'wb') as f:
75
+ pickle.dump(self.bm25_retriever, f)
76
+ print(f"✅ Built and cached BM25 retriever with {len(documents)} documents")
77
+
78
+ except Exception as e:
79
+ print(f"❌ Failed to build BM25 retriever: {e}")
80
+ print("BM25 search will be disabled")
81
+ self.bm25_retriever = None
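A hedged construction sketch for the retriever above; the config keys follow the get() defaults used in this class, the documents are invented, and BM25 indexing additionally requires the rank_bm25 package:

```python
from langchain.docstore.document import Document

config = {"retriever": {"top_k": 20}, "bm25": {"top_k": 20}}
retriever = HybridRetriever(config)

docs = [
    Document(page_content="PDM implementation challenges in 2022",
             metadata={"source": "Local Government", "year": "2022",
                       "filename": "dlg_audit_2022.pdf"}),
    Document(page_content="Consolidated audit of budget execution",
             metadata={"source": "Consolidated", "year": "2023",
                       "filename": "consolidated_2023.pdf"}),
]

# Builds the index and writes cache/bm25/bm25_retriever.pkl;
# pass force_rebuild=True whenever the corpus changes.
retriever.initialize_bm25(docs)
```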
82
+
83
+ def _filter_documents_by_metadata(
84
+ self,
85
+ documents: List[Document],
86
+ reports: List[str] = None,
87
+ sources: str = None,
88
+ subtype: List[str] = None,
89
+ year: List[str] = None
90
+ ) -> List[Document]:
91
+ """
92
+ Filter documents by metadata criteria.
93
+
94
+ Args:
95
+ documents: List of documents to filter
96
+ reports: List of specific report filenames
97
+ sources: Source category
98
+ subtype: List of subtypes
99
+ year: List of years
100
+
101
+ Returns:
102
+ Filtered list of documents
103
+ """
104
+ if not any([reports, sources, subtype, year]):
105
+ return documents
106
+
107
+ filtered_docs = []
108
+ for doc in documents:
109
+ metadata = doc.metadata
110
+
111
+ # Filter by reports
112
+ if reports:
113
+ filename = metadata.get('filename', '')
114
+ if not any(report in filename for report in reports):
115
+ continue
116
+
117
+ # Filter by sources
118
+ if sources:
119
+ doc_source = metadata.get('source', '')
120
+ if sources != doc_source:
121
+ continue
122
+
123
+ # Filter by subtype
124
+ if subtype:
125
+ doc_subtype = metadata.get('subtype', '')
126
+ if doc_subtype not in subtype:
127
+ continue
128
+
129
+ # Filter by year
130
+ if year:
131
+ doc_year = str(metadata.get('year', ''))
132
+ if doc_year not in year:
133
+ continue
134
+
135
+ filtered_docs.append(doc)
136
+
137
+ return filtered_docs
138
+
139
+ def _bm25_search(
140
+ self,
141
+ query: str,
142
+ k: int = 20,
143
+ reports: List[str] = None,
144
+ sources: str = None,
145
+ subtype: List[str] = None,
146
+ year: List[str] = None
147
+ ) -> List[Tuple[Document, float]]:
148
+ """
149
+ Perform BM25 sparse search.
150
+
151
+ Args:
152
+ query: Search query
153
+ k: Number of documents to retrieve
154
+ reports: List of specific report filenames
155
+ sources: Source category
156
+ subtype: List of subtypes
157
+ year: List of years
158
+
159
+ Returns:
160
+ List of (Document, score) tuples
161
+ """
162
+ if not self.bm25_retriever:
163
+ print("⚠️ BM25 retriever not available")
164
+ return []
165
+
166
+ try:
167
+ # Get BM25 results
168
+ self.bm25_retriever.k = k
169
+ bm25_docs = self.bm25_retriever.invoke(query)
170
+
171
+ # Apply metadata filtering
172
+ if any([reports, sources, subtype, year]):
173
+ bm25_docs = self._filter_documents_by_metadata(
174
+ bm25_docs, reports, sources, subtype, year
175
+ )
176
+
177
+ # BM25Retriever doesn't return scores directly, so we'll use placeholder scores
178
+ # In a production system, you'd want to access the actual BM25 scores
179
+ results = []
180
+ for i, doc in enumerate(bm25_docs):
181
+ # Assign decreasing scores based on rank (higher rank = higher score)
182
+ # Normalize to [0, 1] range for consistency with vector search
183
+ score = max(0.1, 1.0 - (i / max(len(bm25_docs), 1)))
184
+ results.append((doc, score))
185
+
186
+ return results
187
+
188
+ except Exception as e:
189
+ print(f"❌ BM25 search failed: {e}")
190
+ return []
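Because BM25Retriever does not expose raw BM25 scores, the method above maps ranks to pseudo-scores. The formula in isolation, with a worked run:

```python
# Rank 0 gets 1.0, later ranks decay linearly, floored at 0.1 so every
# returned chunk keeps some weight in the later fusion step.
def rank_to_score(rank: int, total: int) -> float:
    return max(0.1, 1.0 - (rank / max(total, 1)))

print([round(rank_to_score(i, 5), 2) for i in range(5)])
# [1.0, 0.8, 0.6, 0.4, 0.2]
```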
191
+
192
+ def _vector_search(
193
+ self,
194
+ vectorstore: QdrantVectorStore,
195
+ query: str,
196
+ k: int = 20,
197
+ reports: List[str] = None,
198
+ sources: str = None,
199
+ subtype: List[str] = None,
200
+ year: List[str] = None
201
+ ) -> List[Tuple[Document, float]]:
202
+ """
203
+ Perform vector similarity search.
204
+
205
+ Args:
206
+ vectorstore: QdrantVectorStore instance
207
+ query: Search query
208
+ k: Number of documents to retrieve
209
+ reports: List of specific report filenames
210
+ sources: Source category
211
+ subtype: List of subtypes
212
+ year: List of years
213
+
214
+ Returns:
215
+ List of (Document, score) tuples
216
+ """
217
+ try:
218
+ # Create filter
219
+ filter_obj = create_filter(
220
+ reports=reports,
221
+ sources=sources,
222
+ subtype=subtype,
223
+ year=year
224
+ )
225
+
226
+ # Perform vector search
227
+ if filter_obj:
228
+ results = vectorstore.similarity_search_with_score(
229
+ query, k=k, filter=filter_obj
230
+ )
231
+ else:
232
+ results = vectorstore.similarity_search_with_score(query, k=k)
233
+
234
+ return results
235
+
236
+ except Exception as e:
237
+ print(f"❌ Vector search failed: {e}")
238
+ return []
239
+
240
+ def _normalize_scores(self, results: List[Tuple[Document, float]], method: str = "min_max") -> List[Tuple[Document, float]]:
241
+ """
242
+ Normalize scores to [0, 1] range.
243
+
244
+ Args:
245
+ results: List of (Document, score) tuples
246
+ method: Normalization method ('min_max' or 'z_score')
247
+
248
+ Returns:
249
+ List of (Document, normalized_score) tuples
250
+ """
251
+ if not results:
252
+ return results
253
+
254
+ scores = [score for _, score in results]
255
+
256
+ if method == "min_max":
257
+ min_score = min(scores)
258
+ max_score = max(scores)
259
+ if max_score == min_score:
260
+ normalized_results = [(doc, 1.0) for doc, _ in results]
261
+ else:
262
+ normalized_results = [
263
+ (doc, (score - min_score) / (max_score - min_score))
264
+ for doc, score in results
265
+ ]
266
+ elif method == "z_score":
267
+ mean_score = np.mean(scores)
268
+ std_score = np.std(scores)
269
+ if std_score == 0:
270
+ normalized_results = [(doc, 1.0) for doc, _ in results]
271
+ else:
272
+ normalized_results = [
273
+ (doc, max(0, (score - mean_score) / std_score))
274
+ for doc, score in results
275
+ ]
276
+ else:
277
+ normalized_results = results
278
+
279
+ return normalized_results
280
+
281
+ def _combine_results(
282
+ self,
283
+ vector_results: List[Tuple[Document, float]],
284
+ bm25_results: List[Tuple[Document, float]],
285
+ alpha: float = 0.5
286
+ ) -> List[Tuple[Document, float]]:
287
+ """
288
+ Combine vector and BM25 results with weighted scoring.
289
+
290
+ Args:
291
+ vector_results: Vector search results
292
+ bm25_results: BM25 search results
293
+ alpha: Weight for vector scores (1-alpha for BM25 scores)
294
+
295
+ Returns:
296
+ Combined and ranked results
297
+ """
298
+ # Normalize scores
299
+ vector_results = self._normalize_scores(vector_results)
300
+ bm25_results = self._normalize_scores(bm25_results)
301
+
302
+ # Key by page content so a chunk retrieved by both methods is merged rather than duplicated (the two result sets hold distinct Document objects, so id() would never overlap)
303
+ vector_docs = {doc.page_content: (doc, score) for doc, score in vector_results}
304
+ bm25_docs = {doc.page_content: (doc, score) for doc, score in bm25_results}
305
+
306
+ # Combine scores
307
+ combined_scores = {}
308
+ all_doc_ids = set(vector_docs.keys()) | set(bm25_docs.keys())
309
+
310
+ for doc_id in all_doc_ids:
311
+ vector_score = vector_docs.get(doc_id, (None, 0.0))[1]
312
+ bm25_score = bm25_docs.get(doc_id, (None, 0.0))[1]
313
+
314
+ # Weighted combination
315
+ combined_score = alpha * vector_score + (1 - alpha) * bm25_score
316
+
317
+ # Get document object
318
+ doc = vector_docs.get(doc_id, bm25_docs.get(doc_id))[0]
319
+ combined_scores[doc_id] = (doc, combined_score)
320
+
321
+ # Sort by combined score (descending)
322
+ sorted_results = sorted(
323
+ combined_scores.values(),
324
+ key=lambda x: x[1],
325
+ reverse=True
326
+ )
327
+
328
+ return sorted_results
329
+
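A tiny worked example of the weighted fusion above, with made-up, already-normalized scores; chunks found by both methods tend to rise because a miss on either side contributes 0.0:

```python
alpha = 0.5          # weight of the dense (vector) score
vector_score = 0.9   # semantic similarity, min-max normalized
bm25_score = 0.4     # keyword score, min-max normalized

combined = alpha * vector_score + (1 - alpha) * bm25_score
print(round(combined, 2))  # 0.65

# A BM25-only hit with the same keyword score would land at 0.5 * 0.0 + 0.5 * 0.4 = 0.2.
```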
330
+ def retrieve(
331
+ self,
332
+ vectorstore: QdrantVectorStore,
333
+ query: str,
334
+ mode: str = "hybrid",
335
+ reports: List[str] = None,
336
+ sources: str = None,
337
+ subtype: List[str] = None,
338
+ year: List[str] = None,
339
+ alpha: float = 0.5,
340
+ k: int = None
341
+ ) -> List[Document]:
342
+ """
343
+ Retrieve documents using the specified search mode.
344
+
345
+ Args:
346
+ vectorstore: QdrantVectorStore instance
347
+ query: Search query
348
+ mode: Search mode ('vector_only', 'sparse_only', or 'hybrid')
349
+ reports: List of specific report filenames
350
+ sources: Source category
351
+ subtype: List of subtypes
352
+ year: List of years
353
+ alpha: Weight for vector scores in hybrid mode (0.5 = equal weight)
354
+ k: Number of documents to retrieve
355
+
356
+ Returns:
357
+ List of relevant Document objects
358
+ """
359
+ if k is None:
360
+ k = self.config.get("retriever", {}).get("top_k", 20)
361
+
362
+ results = []
363
+
364
+ if mode == "vector_only":
365
+ # Vector search only
366
+ vector_results = self._vector_search(
367
+ vectorstore, query, k, reports, sources, subtype, year
368
+ )
369
+ results = [(doc, score) for doc, score in vector_results]
370
+
371
+ elif mode == "sparse_only":
372
+ # BM25 search only
373
+ bm25_results = self._bm25_search(
374
+ query, k, reports, sources, subtype, year
375
+ )
376
+ results = [(doc, score) for doc, score in bm25_results]
377
+
378
+ elif mode == "hybrid":
379
+ # Hybrid search - combine both
380
+ # Get more results from each method to have better fusion
381
+ retrieval_k = min(k * 2, 50) # Get more candidates for fusion
382
+
383
+ vector_results = self._vector_search(
384
+ vectorstore, query, retrieval_k, reports, sources, subtype, year
385
+ )
386
+ bm25_results = self._bm25_search(
387
+ query, retrieval_k, reports, sources, subtype, year
388
+ )
389
+
390
+ results = self._combine_results(vector_results, bm25_results, alpha)
391
+
392
+ else:
393
+ raise ValueError(f"Unknown search mode: {mode}")
394
+
395
+ # Limit to top k results
396
+ results = results[:k]
397
+
398
+ # Return just the documents
399
+ return [doc for doc, score in results]
400
+
401
+ def retrieve_with_scores(
402
+ self,
403
+ vectorstore: QdrantVectorStore,
404
+ query: str,
405
+ mode: str = "hybrid",
406
+ reports: List[str] = None,
407
+ sources: str = None,
408
+ subtype: List[str] = None,
409
+ year: List[str] = None,
410
+ alpha: float = 0.5,
411
+ k: int = None
412
+ ) -> List[Tuple[Document, float]]:
413
+ """
414
+ Retrieve documents with scores using the specified search mode.
415
+
416
+ Args:
417
+ vectorstore: QdrantVectorStore instance
418
+ query: Search query
419
+ mode: Search mode ('vector_only', 'sparse_only', or 'hybrid')
420
+ reports: List of specific report filenames
421
+ sources: Source category
422
+ subtype: List of subtypes
423
+ year: List of years
424
+ alpha: Weight for vector scores in hybrid mode (0.5 = equal weight)
425
+ k: Number of documents to retrieve
426
+
427
+ Returns:
428
+ List of (Document, score) tuples
429
+ """
430
+ if k is None:
431
+ k = self.config.get("retriever", {}).get("top_k", 20)
432
+
433
+ results = []
434
+
435
+ if mode == "vector_only":
436
+ # Vector search only
437
+ results = self._vector_search(
438
+ vectorstore, query, k, reports, sources, subtype, year
439
+ )
440
+
441
+ elif mode == "sparse_only":
442
+ # BM25 search only
443
+ results = self._bm25_search(
444
+ query, k, reports, sources, subtype, year
445
+ )
446
+
447
+ elif mode == "hybrid":
448
+ # Hybrid search - combine both
449
+ # Get more results from each method to have better fusion
450
+ retrieval_k = min(k * 2, 50) # Get more candidates for fusion
451
+
452
+ vector_results = self._vector_search(
453
+ vectorstore, query, retrieval_k, reports, sources, subtype, year
454
+ )
455
+ bm25_results = self._bm25_search(
456
+ query, retrieval_k, reports, sources, subtype, year
457
+ )
458
+
459
+ results = self._combine_results(vector_results, bm25_results, alpha)
460
+
461
+ else:
462
+ raise ValueError(f"Unknown search mode: {mode}")
463
+
464
+ # Limit to top k results
465
+ return results[:k]
466
+
467
+
468
+ def get_available_search_modes() -> List[str]:
469
+ """Get list of available search modes."""
470
+ return ["vector_only", "sparse_only", "hybrid"]
471
+
472
+
473
+ def get_search_mode_description() -> Dict[str, str]:
474
+ """Get descriptions for each search mode."""
475
+ return {
476
+ "vector_only": "Semantic search using dense embeddings - good for conceptual matching",
477
+ "sparse_only": "Keyword search using BM25 - good for exact term matching",
478
+ "hybrid": "Combined semantic and keyword search - balanced approach"
479
+ }
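A hedged end-to-end sketch of a hybrid query with the class above; the vectorstore connection and chunk list are assumed to be prepared elsewhere, and all concrete values are illustrative:

```python
from typing import List

from langchain.docstore.document import Document
from langchain_qdrant import QdrantVectorStore


def search_audit_reports(vectorstore: QdrantVectorStore,
                         all_chunks: List[Document]) -> List[Document]:
    """Hybrid query against Local Government reports from 2022 (illustrative)."""
    config = {"retriever": {"top_k": 10}, "bm25": {"top_k": 20}}
    retriever = HybridRetriever(config)
    retriever.initialize_bm25(all_chunks)   # builds or loads the cached BM25 index
    return retriever.retrieve(
        vectorstore,
        query="PDM implementation challenges in 2022",
        mode="hybrid",
        sources="Local Government",
        year=["2022"],
        alpha=0.6,                          # lean slightly toward semantic matches
    )
```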
src/vectorstore.py ADDED
@@ -0,0 +1,266 @@
1
+ """Vector store management and operations."""
2
+ from pathlib import Path
3
+ from typing import Dict, Any, List, Optional
4
+
5
+
6
+ import torch
7
+ from langchain_qdrant import QdrantVectorStore
8
+ from langchain.docstore.document import Document
9
+ from langchain_core.embeddings import Embeddings
10
+ from sentence_transformers import SentenceTransformer
11
+ from langchain_huggingface import HuggingFaceEmbeddings
12
+
13
+
14
+ class MatryoshkaEmbeddings(Embeddings):
15
+ """Custom embeddings class that supports Matryoshka dimension truncation."""
16
+
17
+ def __init__(self, model_name: str, truncate_dim: int = None, **kwargs):
18
+ """
19
+ Initialize Matryoshka embeddings.
20
+
21
+ Args:
22
+ model_name: Name of the model
23
+ truncate_dim: Dimension to truncate to (for Matryoshka models)
24
+ **kwargs: Additional arguments (ignored for Matryoshka models)
25
+ """
26
+ self.model_name = model_name
27
+ self.truncate_dim = truncate_dim
28
+
29
+ if truncate_dim and "matryoshka" in model_name.lower():
30
+ # Use SentenceTransformer directly for Matryoshka models
31
+ device = "cuda" if torch.cuda.is_available() else "cpu"
32
+ self.model = SentenceTransformer(model_name, truncate_dim=truncate_dim, device=device)
33
+ print(f"🔧 Matryoshka model configured for {truncate_dim} dimensions")
34
+ else:
35
+ # Use standard HuggingFaceEmbeddings
36
+ self.model = HuggingFaceEmbeddings(model_name=model_name, **kwargs)
37
+
38
+ def embed_documents(self, texts: List[str]) -> List[List[float]]:
39
+ """Embed documents."""
40
+ if self.truncate_dim and "matryoshka" in self.model_name.lower():
41
+ embeddings = self.model.encode(texts, normalize_embeddings=True)
42
+ return embeddings.tolist()
43
+ else:
44
+ return self.model.embed_documents(texts)
45
+
46
+ def embed_query(self, text: str) -> List[float]:
47
+ """Embed query."""
48
+ if self.truncate_dim and "matryoshka" in self.model_name.lower():
49
+ embedding = self.model.encode([text], normalize_embeddings=True)
50
+ return embedding[0].tolist()
51
+ else:
52
+ return self.model.embed_query(text)
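A minimal sketch of what the truncation does, assuming a recent sentence-transformers release (truncate_dim was added in 2.7) and a Matryoshka-trained checkpoint; the model name here is an assumption, not taken from the project config:

```python
from sentence_transformers import SentenceTransformer

# Matryoshka-trained models keep most of their quality when only a prefix
# of the embedding dimensions is retained.
model = SentenceTransformer("nomic-ai/modernbert-embed-base", truncate_dim=256)
vec = model.encode(["consolidated audit report"], normalize_embeddings=True)
print(vec.shape)  # (1, 256) instead of the model's full dimensionality
```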
53
+
54
+
55
+ class VectorStoreManager:
56
+ """Manages vector store operations and connections."""
57
+
58
+ def __init__(self, config: Dict[str, Any]):
59
+ """
60
+ Initialize vector store manager.
61
+
62
+ Args:
63
+ config: Configuration dictionary
64
+ """
65
+ self.config = config
66
+ self.embeddings = self._create_embeddings()
67
+ self.vectorstore = None
68
+
69
+ # Define metadata fields that need payload indexes for filtering
70
+ self.metadata_fields = [
71
+ ("metadata.year", "keyword"),
72
+ ("metadata.source", "keyword"),
73
+ ("metadata.filename", "keyword"),
74
+ # Add more metadata fields as needed
75
+ ]
76
+
77
+ def _create_embeddings(self) -> HuggingFaceEmbeddings:
78
+ """Create embeddings model from configuration."""
79
+ device = "cuda" if torch.cuda.is_available() else "cpu"
80
+
81
+ model_name = self.config["retriever"]["model"]
82
+ normalize = self.config["retriever"]["normalize"]
83
+
84
+ model_kwargs = {"device": device}
85
+ encode_kwargs = {
86
+ "normalize_embeddings": normalize,
87
+ "batch_size": 100,
88
+ }
89
+
90
+ # For Matryoshka models, check if we need to truncate dimensions
91
+ if "matryoshka" in model_name.lower():
92
+ # Check if we have a specific dimension requirement
93
+ collection_name = self.config.get("qdrant", {}).get("collection_name", "")
94
+
95
+ if "modernbert-embed-base-akryl-matryoshka" in collection_name:
96
+ # This collection expects 768 dimensions
97
+ truncate_dim = 768
98
+ print(f"🔧 Matryoshka model configured for {truncate_dim} dimensions")
99
+
100
+ # Use custom MatryoshkaEmbeddings
101
+ embeddings = MatryoshkaEmbeddings(
102
+ model_name=model_name,
103
+ truncate_dim=truncate_dim,
104
+ model_kwargs=model_kwargs,
105
+ encode_kwargs=encode_kwargs,
106
+ show_progress=True,
107
+ )
108
+ return embeddings
109
+
110
+ # Use standard HuggingFaceEmbeddings for non-Matryoshka models
111
+ embeddings = HuggingFaceEmbeddings(
112
+ model_name=model_name,
113
+ model_kwargs=model_kwargs,
114
+ encode_kwargs=encode_kwargs,
115
+ show_progress=True,
116
+ )
117
+
118
+ return embeddings
119
+
120
+ def ensure_metadata_indexes(self) -> None:
121
+ """
122
+ Create payload indexes for all required metadata fields.
123
+ This ensures filtering works properly, especially in Qdrant Cloud.
124
+ """
125
+ if not self.vectorstore:
126
+ return
127
+
128
+ qdrant_config = self.config["qdrant"]
129
+ collection_name = qdrant_config["collection_name"]
130
+
131
+ for field_name, field_type in self.metadata_fields:
132
+ try:
133
+ self.vectorstore.client.create_payload_index(
134
+ collection_name=collection_name,
135
+ field_name=field_name,
136
+ field_type=field_type
137
+ )
138
+ print(f"Created payload index for {field_name} ({field_type})")
139
+ except Exception as e:
140
+ # Index might already exist or other error - log but continue
141
+ print(f"Index creation for {field_name} ({field_type}): {str(e)}")
142
+
143
+ def connect_to_existing(self, force_recreate: bool = False) -> QdrantVectorStore:
144
+ """
145
+ Connect to existing Qdrant collection.
146
+
147
+ Args:
148
+ force_recreate: If True, recreate the collection if dimension mismatch occurs
149
+
150
+ Returns:
151
+ QdrantVectorStore instance
152
+ """
153
+ qdrant_config = self.config["qdrant"]
154
+
155
+ kwargs_qdrant = {
156
+ "url": qdrant_config["url"],
157
+ "collection_name": qdrant_config["collection_name"],
158
+ "prefer_grpc": qdrant_config.get("prefer_grpc", True),
159
+ "api_key": qdrant_config.get("api_key", None),
160
+ }
161
+
162
+ if force_recreate:
163
+ kwargs_qdrant["force_recreate"] = True
164
+
165
+ self.vectorstore = QdrantVectorStore.from_existing_collection(
166
+ embedding=self.embeddings,
167
+ **kwargs_qdrant
168
+ )
169
+
170
+ # Ensure payload indexes exist for metadata filtering
171
+ self.ensure_metadata_indexes()
172
+
173
+ return self.vectorstore
174
+
175
+ def create_from_documents(self, documents: List[Document]) -> QdrantVectorStore:
176
+ """
177
+ Create new Qdrant collection from documents.
178
+
179
+ Args:
180
+ documents: List of Document objects
181
+
182
+ Returns:
183
+ QdrantVectorStore instance
184
+ """
185
+ qdrant_config = self.config["qdrant"]
186
+
187
+ kwargs_qdrant = {
188
+ "url": qdrant_config["url"],
189
+ "collection_name": qdrant_config["collection_name"],
190
+ "prefer_grpc": qdrant_config.get("prefer_grpc", True),
191
+ "api_key": qdrant_config.get("api_key", None),
192
+ }
193
+
194
+ self.vectorstore = QdrantVectorStore.from_documents(
195
+ documents=documents,
196
+ embedding=self.embeddings,
197
+ **kwargs_qdrant
198
+ )
199
+
200
+ # Ensure payload indexes exist for metadata filtering
201
+ self.ensure_metadata_indexes()
202
+
203
+ return self.vectorstore
204
+
205
+ def delete_collection(self) -> None:
206
+ """
207
+ Delete the current Qdrant collection.
208
+
209
+ Returns:
210
+ QdrantVectorStore instance
211
+ """
212
+ qdrant_config = self.config["qdrant"]
213
+ collection_name = qdrant_config.get("collection_name")
214
+
215
+ self.vectorstore.client.delete_collection(
216
+ collection_name=collection_name
217
+ )
218
+
219
+ return self.vectorstore
220
+
221
+ def get_vectorstore(self) -> Optional[QdrantVectorStore]:
222
+ """Get current vectorstore instance."""
223
+ return self.vectorstore
224
+
225
+
226
+ def get_local_qdrant(config: Dict[str, Any]) -> QdrantVectorStore:
227
+ """
228
+ Get local Qdrant vector store (legacy function for compatibility).
229
+
230
+ Args:
231
+ config: Configuration dictionary
232
+
233
+ Returns:
234
+ QdrantVectorStore instance
235
+ """
236
+ manager = VectorStoreManager(config)
237
+ return manager.connect_to_existing()
238
+
239
+
240
+ def create_vectorstore(config: Dict[str, Any], documents: List[Document]) -> QdrantVectorStore:
241
+ """
242
+ Create new vector store from documents.
243
+
244
+ Args:
245
+ config: Configuration dictionary
246
+ documents: List of Document objects
247
+
248
+ Returns:
249
+ QdrantVectorStore instance
250
+ """
251
+ manager = VectorStoreManager(config)
252
+ return manager.create_from_documents(documents)
253
+
254
+
255
+ def get_embeddings_model(config: Dict[str, Any]) -> HuggingFaceEmbeddings:
256
+ """
257
+ Create embeddings model from configuration (legacy function).
258
+
259
+ Args:
260
+ config: Configuration dictionary
261
+
262
+ Returns:
263
+ HuggingFaceEmbeddings instance
264
+ """
265
+ manager = VectorStoreManager(config)
266
+ return manager.embeddings
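A hedged configuration sketch for the manager above; the URL, API key, collection name and model name are placeholders or assumptions, not values from the repository:

```python
config = {
    "retriever": {
        "model": "nomic-ai/modernbert-embed-base",  # assumption
        "normalize": True,
        "top_k": 20,
    },
    "qdrant": {
        "url": "https://YOUR-CLUSTER.qdrant.io",    # placeholder
        "api_key": "YOUR_API_KEY",                  # placeholder
        "collection_name": "audit_reports",         # placeholder
        "prefer_grpc": True,
    },
}

manager = VectorStoreManager(config)
vectorstore = manager.connect_to_existing()  # also ensures the payload indexes exist
```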
utils.py ADDED
@@ -0,0 +1,163 @@
1
+ import json
2
+ import dataclasses
3
+ from uuid import UUID
4
+ from typing import Any
5
+ from datetime import datetime, date
6
+
7
+
8
+ import configparser
9
+ from torch import cuda
10
+ from qdrant_client.http import models as rest
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ from langchain_community.cross_encoders import HuggingFaceCrossEncoder
13
+
14
+
15
+ def get_config(fp):
16
+ config = configparser.ConfigParser()
17
+ config.read_file(open(fp))
18
+ return config
19
+
20
+
21
+ def get_embeddings_model(config):
22
+ device = "cuda" if cuda.is_available() else "cpu"
23
+
24
+ # Define embedding model
25
+ model_name = config.get("retriever", "MODEL")
26
+ model_kwargs = {"device": device}
27
+ normalize_embeddings = bool(int(config.get("retriever", "NORMALIZE")))
28
+ encode_kwargs = {
29
+ "normalize_embeddings": normalize_embeddings,
30
+ "batch_size": 100,
31
+ }
32
+
33
+ embeddings = HuggingFaceEmbeddings(
34
+ show_progress=True,
35
+ model_name=model_name,
36
+ model_kwargs=model_kwargs,
37
+ encode_kwargs=encode_kwargs,
38
+ )
39
+
40
+ return embeddings
41
+
42
+ # Create a search filter for Qdrant
43
+ def create_filter(
44
+ reports: list = [], sources: str = None, subtype: str = None, year: str = None
45
+ ):
46
+ if len(reports) == 0:
47
+ print(f"defining filter for sources:{sources}, subtype:{subtype}")
48
+ filter = rest.Filter(
49
+ must=[
50
+ rest.FieldCondition(
51
+ key="metadata.source", match=rest.MatchValue(value=sources)
52
+ ),
53
+ rest.FieldCondition(
54
+ key="metadata.filename", match=rest.MatchAny(any=subtype)
55
+ ),
56
+ # rest.FieldCondition(
57
+ # key="metadata.year",
58
+ # match=rest.MatchAny(any=year)
59
+ ]
60
+ )
61
+ else:
62
+ print(f"defining filter for allreports:{reports}")
63
+ filter = rest.Filter(
64
+ must=[
65
+ rest.FieldCondition(
66
+ key="metadata.filename", match=rest.MatchAny(any=reports)
67
+ )
68
+ ]
69
+ )
70
+
71
+ return filter
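A usage sketch of this legacy helper with invented values. Note that, as written, the subtype values are matched against the metadata.filename field:

```python
# Illustrative values only; requires qdrant-client (imported above as `rest`).
f = create_filter(
    reports=[],
    sources="Consolidated",
    subtype=["consolidated_audit_2022.pdf"],
)
print(type(f).__name__)  # Filter, carrying two `must` conditions
```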
72
+
73
+
74
+ def load_json(fp):
75
+ with open(fp, "r") as f:
76
+ docs = json.load(f)
77
+ return docs
78
+
79
+ def get_timestamp():
80
+ now = datetime.now()
81
+ timestamp = now.strftime("%Y%m%d%H%M%S")
82
+ return timestamp
83
+
84
+
85
+
86
+ # A custom class to help with recursive serialization.
87
+ # This approach avoids modifying the original object.
88
+ class _RecursiveSerializer(json.JSONEncoder):
89
+ """A custom JSONEncoder that handles complex types by converting them to dicts or strings."""
90
+ def default(self, obj):
91
+ # Prefer the pydantic method if it exists for the most robust serialization.
92
+ if hasattr(obj, 'model_dump'):
93
+ return obj.model_dump()
94
+
95
+ # Handle dataclasses
96
+ if dataclasses.is_dataclass(obj):
97
+ return dataclasses.asdict(obj)
98
+
99
+ # Handle other non-serializable but common types.
100
+ if isinstance(obj, (datetime, date, UUID)):
101
+ return str(obj)
102
+
103
+ # Fallback for general objects with a __dict__
104
+ if hasattr(obj, '__dict__'):
105
+ return obj.__dict__
106
+
107
+ # Default fallback to JSONEncoder's behavior
108
+ return super().default(obj)
109
+
110
+ def to_json_string(obj: Any, **kwargs) -> str:
111
+ """
112
+ Serializes a Python object into a JSON-formatted string.
113
+
114
+ This function is a comprehensive utility that can handle:
115
+ - Standard Python types (lists, dicts, strings, numbers, bools, None).
116
+ - Pydantic models (using `model_dump()`).
117
+ - Dataclasses (using `dataclasses.asdict()`).
118
+ - Standard library types not natively JSON-serializable (e.g., datetime, UUID).
119
+ - Custom classes with a `__dict__`.
120
+
121
+ Args:
122
+ obj (Any): The Python object to serialize.
123
+ **kwargs: Additional keyword arguments to pass to `json.dumps`.
124
+
125
+ Returns:
126
+ str: A JSON-formatted string.
127
+
128
+ Example:
129
+ >>> from datetime import datetime
130
+ >>> from pydantic import BaseModel
131
+ >>> from dataclasses import dataclass
132
+
133
+ >>> class Address(BaseModel):
134
+ ... street: str
135
+ ... city: str
136
+
137
+ >>> @dataclass
138
+ ... class Product:
139
+ ... id: int
140
+ ... name: str
141
+
142
+ >>> class Order(BaseModel):
143
+ ... user_address: Address
144
+ ... item: Product
145
+
146
+ >>> order_obj = Order(
147
+ ... user_address=Address(street="123 Main St", city="Example City"),
148
+ ... item=Product(id=1, name="Laptop")
149
+ ... )
150
+
151
+ >>> print(to_json_string(order_obj, indent=2))
152
+ {
153
+ "user_address": {
154
+ "street": "123 Main St",
155
+ "city": "Example City"
156
+ },
157
+ "item": {
158
+ "id": 1,
159
+ "name": "Laptop"
160
+ }
161
+ }
162
+ """
163
+ return json.dumps(obj, cls=_RecursiveSerializer, **kwargs)