LiamKhoaLe committed
Commit 367abb6 · 1 Parent(s): 01ac218

Optimise chats and reports route concurrency

Files changed (2)
  1. routes/chats.py   +84 -43
  2. routes/reports.py +83 -81
routes/chats.py CHANGED
@@ -203,6 +203,7 @@ async def delete_all_chat_history(user_id: str):
 
 # In-memory status tracking for real-time updates
 chat_status_store = {}
+_embedding_cache = {}
 
 @app.get("/chat/status/{session_id}", response_model=StatusUpdateResponse)
 async def get_chat_status(session_id: str):
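Note: the new module-level `_embedding_cache` dict is filled and evicted further down in `_chat_impl` (third hunk below). A compact, standalone sketch of that kind of timestamped, size-capped cache; the names `cached_embed` and `fake_embedder` are illustrative and not from this repo:

```python
import time

# (timestamp, value) entries keyed by a request-derived string.
_cache: dict[str, tuple[float, list[float]]] = {}
MAX_ENTRIES = 128
TTL_SECONDS = 600

def cached_embed(key: str, embed_fn) -> list[float]:
    now = time.time()
    # Crude size cap: drop a batch of the oldest-inserted keys when the dict grows too big.
    if len(_cache) > MAX_ENTRIES:
        for stale_key in list(_cache.keys())[:32]:
            _cache.pop(stale_key, None)
    entry = _cache.get(key)
    if entry and now - entry[0] < TTL_SECONDS:
        return entry[1]  # reuse the cached vector
    value = embed_fn(key)
    _cache[key] = (now, value)
    return value

# Example: the second call within the TTL reuses the cached vector.
fake_embedder = lambda text: [float(len(text))]
print(cached_embed("hello world", fake_embedder))
print(cached_embed("hello world", fake_embedder))
```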
@@ -349,19 +350,23 @@ async def chat(
         rag.db["chat_sessions"].insert_one(session_data)
         logger.info(f"[CHAT] Created session record for {session_id}")
 
-        # If this is the first user message, trigger auto-naming
+        # If this is the first user message, trigger auto-naming (non-blocking)
         if existing_messages == 0:
             try:
                 from helpers.namer import auto_name_session_immediate
-                session_name = await auto_name_session_immediate(
-                    user_id, project_id, session_id, question, nvidia_rotator, rag.db
-                )
-                if session_name:
-                    logger.info(f"[CHAT] Auto-named session {session_id} to '{session_name}'")
-                else:
-                    logger.warning(f"[CHAT] Auto-naming failed for session {session_id}")
+                import asyncio as _asyncio_name
+                async def _do_name():
+                    try:
+                        _name = await auto_name_session_immediate(
+                            user_id, project_id, session_id, question, nvidia_rotator, rag.db
+                        )
+                        if _name:
+                            logger.info(f"[CHAT] Auto-named session {session_id} to '{_name}'")
+                    except Exception as _e:
+                        logger.warning(f"[CHAT] Auto-naming failed: {_e}")
+                _asyncio_name.create_task(_do_name())
             except Exception as e:
-                logger.warning(f"[CHAT] Auto-naming failed: {e}")
+                logger.warning(f"[CHAT] Auto-naming scheduling failed: {e}")
 
         # Get the chat response
         chat_response = await asyncio.wait_for(_chat_impl(user_id, project_id, question, k, use_web=use_web, max_web=max_web, session_id=session_id), timeout=120.0)
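The hunk above takes auto-naming off the request path by wrapping it in a coroutine and scheduling it with `asyncio.create_task` instead of awaiting it; the background coroutine has to catch its own exceptions, since nothing ever awaits it. A minimal sketch of that fire-and-forget pattern; `slow_rename` and `handle_request` are placeholders, not project code:

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sketch")

async def slow_rename(session_id: str) -> str:
    # Stand-in for an LLM-backed naming call; sleeps instead of calling a provider.
    await asyncio.sleep(0.5)
    return f"Session {session_id} (auto-named)"

async def handle_request(session_id: str) -> str:
    async def _do_name():
        try:
            name = await slow_rename(session_id)
            logger.info("Auto-named %s to %r", session_id, name)
        except Exception as exc:
            # Nothing awaits this task, so it must log its own failures.
            logger.warning("Auto-naming failed: %s", exc)

    # Schedule naming without awaiting it, so the response is not delayed.
    asyncio.create_task(_do_name())
    return "chat response"

async def main():
    print(await handle_request("abc123"))
    # Give the detached task a moment to finish before the loop closes.
    await asyncio.sleep(1)

asyncio.run(main())
```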
@@ -502,29 +507,55 @@ async def _chat_impl(
     # Use enhanced question for better query variations
     enhanced_queries = await _generate_query_variations(enhanced_question, nvidia_rotator)
     logger.info(f"[CHAT] Generated {len(enhanced_queries)} query variations")
-
+
     # Update status: Planning action (planning search strategy)
     if session_id:
         update_chat_status(session_id, "planning", "Planning action...", 25)
+
+    # Batch-embed all query variants once
+    # Simple per-session embedding cache
+    cache_key = f"{project_id}:{session_id or 'na'}:" + ("|".join(enhanced_queries))[:512]
+    try:
+        now_ts = time.time()
+        # Evict stale entries (older than 10 minutes) or keep cache under 128 items
+        if len(_embedding_cache) > 128:
+            for k in list(_embedding_cache.keys())[:32]:
+                _embedding_cache.pop(k, None)
+        if cache_key in _embedding_cache and (now_ts - _embedding_cache[cache_key][0] < 600):
+            query_vectors = _embedding_cache[cache_key][1]
+        else:
+            query_vectors = embedder.embed(enhanced_queries)
+            _embedding_cache[cache_key] = (now_ts, query_vectors)
+    except Exception as e:
+        logger.warning(f"[CHAT] Batch embedding failed, falling back per-query: {e}")
+        query_vectors = [embedder.embed([q])[0] for q in enhanced_queries]
+
+    # Run vector searches concurrently across strategies and query variants
     all_hits = []
     search_strategies = ["flat", "hybrid", "local"]
+    tasks = []
+    import asyncio as _asyncio
     for strategy in search_strategies:
-        for query_variant in enhanced_queries:
-            q_vec = embedder.embed([query_variant])[0]
-            hits = rag.vector_search(
+        for q_vec in query_vectors:
+            tasks.append(_asyncio.to_thread(
+                rag.vector_search,
                 user_id=user_id,
                 project_id=project_id,
                 query_vector=q_vec,
                 k=k,
                 filenames=relevant_files if relevant_files else None,
                 search_type=strategy
-            )
-            if hits:
-                all_hits.extend(hits)
-                logger.info(f"[CHAT] {strategy} search with '{query_variant[:50]}...' returned {len(hits)} hits")
-                break
-        if all_hits:
-            break
+            ))
+    try:
+        results = await _asyncio.gather(*tasks, return_exceptions=True)
+        for idx, res in enumerate(results):
+            if isinstance(res, Exception):
+                continue
+            if res:
+                all_hits.extend(res)
+        logger.info(f"[CHAT] Parallel search produced {len(all_hits)} raw hits across {len(tasks)} tasks")
+    except Exception as e:
+        logger.warning(f"[CHAT] Parallel search failed: {e}")
     hits = _deduplicate_and_rank_hits(all_hits, question)
     logger.info(f"[CHAT] Final vector search returned {len(hits) if hits else 0} hits")
     if not hits:
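In the hunk above, the synchronous `rag.vector_search` calls are pushed onto worker threads with `asyncio.to_thread` and awaited together via `asyncio.gather(..., return_exceptions=True)`, so one failing search no longer aborts the rest. A self-contained sketch of the same fan-out, with a dummy `blocking_search` standing in for the real vector store:

```python
import asyncio
import time

def blocking_search(query: str, strategy: str) -> list[str]:
    # Stand-in for a synchronous vector-store call.
    time.sleep(0.2)
    return [f"{strategy}:{query}"]

async def search_all(queries: list[str], strategies: list[str]) -> list[str]:
    # One thread-backed task per (strategy, query) pair.
    tasks = [
        asyncio.to_thread(blocking_search, q, s)
        for s in strategies
        for q in queries
    ]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    hits: list[str] = []
    for res in results:
        if isinstance(res, Exception):
            continue  # skip failed searches instead of failing the whole request
        hits.extend(res)
    return hits

hits = asyncio.run(search_all(["a", "b"], ["flat", "hybrid", "local"]))
print(len(hits), "hits")  # 6 hits, produced concurrently rather than sequentially
```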
@@ -682,38 +713,48 @@ async def _chat_impl(
         from memo.history import get_history_manager
         history_manager = get_history_manager(memory)
         qa_sum = await history_manager.summarize_qa_with_nvidia(question, answer, nvidia_rotator)
-
+
         # Use session-specific memory storage
         memory.add_session_memory(user_id, project_id, session_id, question, answer, {
             "relevant_files": relevant_files,
             "sources_count": len(sources_meta),
             "timestamp": time.time()
         })
-
+
         # Also add to global memory for backward compatibility
         memory.add(user_id, qa_sum)
-
-        if memory.is_enhanced_available():
-            await memory.add_conversation_memory(
-                user_id=user_id,
-                question=question,
-                answer=answer,
-                project_id=project_id,
-                session_id=session_id,  # Add session_id to enhanced memory
-                context={
-                    "relevant_files": relevant_files,
-                    "sources_count": len(sources_meta),
-                    "timestamp": time.time()
-                }
-            )
-
-        # Trigger memory consolidation if needed
-        try:
-            consolidation_result = await memory.consolidate_memories(user_id, nvidia_rotator)
-            if consolidation_result.get("consolidated", 0) > 0:
-                logger.info(f"[CHAT] Memory consolidated: {consolidation_result}")
-        except Exception as e:
-            logger.warning(f"[CHAT] Memory consolidation failed: {e}")
+
+        # Enhanced memory writes and consolidation deferred to background
+        async def _write_enhanced_and_consolidate():
+            try:
+                if memory.is_enhanced_available():
+                    await memory.add_conversation_memory(
+                        user_id=user_id,
+                        question=question,
+                        answer=answer,
+                        project_id=project_id,
+                        session_id=session_id,
+                        context={
+                            "relevant_files": relevant_files,
+                            "sources_count": len(sources_meta),
+                            "timestamp": time.time()
+                        }
+                    )
+                try:
+                    consolidation_result = await memory.consolidate_memories(user_id, nvidia_rotator)
+                    if consolidation_result.get("consolidated", 0) > 0:
+                        logger.info(f"[CHAT] Memory consolidated: {consolidation_result}")
+                except Exception as ce:
+                    logger.warning(f"[CHAT] Memory consolidation failed: {ce}")
+            except Exception as we:
+                logger.warning(f"[CHAT] Enhanced memory write failed: {we}")
+
+        try:
+            import asyncio as _asyncio2
+            _asyncio2.create_task(_write_enhanced_and_consolidate())
+        except Exception:
+            # If scheduling fails, fall back to inline write
+            await _write_enhanced_and_consolidate()
     except Exception as e:
         logger.warning(f"QA summarize/store failed: {e}")
     # Merge web sources if any (normalize to filename=url for frontend display)
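Both new `create_task` call sites in this file (auto-naming and the deferred memory write) rely on the scheduled task staying alive; asyncio keeps only a weak reference to tasks, so code that drops the returned reference risks the task being garbage-collected before it finishes. A common guard, shown here as a sketch rather than something this commit adds, is to hold tasks in a set until they complete:

```python
import asyncio

# Strong references keep scheduled tasks alive until they finish.
_background_tasks: set[asyncio.Task] = set()

def spawn(coro) -> asyncio.Task:
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    # Drop the reference once the task is done.
    task.add_done_callback(_background_tasks.discard)
    return task

async def write_memory() -> None:
    await asyncio.sleep(0.1)
    print("memory written")

async def main():
    spawn(write_memory())
    await asyncio.sleep(0.5)  # keep the loop alive long enough for the task

asyncio.run(main())
```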
routes/reports.py CHANGED
@@ -425,12 +425,14 @@ async def execute_detailed_subtasks(cot_plan: Dict[str, Any], context_text: str,
     subsection_number = 1
     agent_context = {}  # Store context from previous agents for CoT references
 
-    for section in sections:
+    import asyncio as _asyncio
+    semaphore = _asyncio.Semaphore(4)  # limit concurrency to avoid provider rate limits
+
+    async def _process_section(section, section_number_local, agent_context_shared):
+        nonlocal subsection_number
         section_title = section.get("title", "Unknown Section")
         section_priority = section.get("priority", "important")
-
-        # Assign section number (1, 2, 3, etc.)
-        section_id = f"{section_number}"
+        section_id = f"{section_number_local}"
         section_analysis = {
             "section_id": section_id,
             "title": section_title,
@@ -438,94 +440,94 @@ async def execute_detailed_subtasks(cot_plan: Dict[str, Any], context_text: str,
             "priority": section_priority,
             "subtask_results": [],
             "section_synthesis": "",
-            "agent_context": agent_context.copy()  # Pass context from previous agents
+            "agent_context": agent_context_shared.copy()
         }
-
-        # Process each subtask with hierarchical subsection assignment
-        subtask_number = 1
-        for subtask in section.get("subtasks", []):
-            task = subtask.get("task", "")
-            reasoning = subtask.get("reasoning", "")
-            sources_needed = subtask.get("sources_needed", ["local"])
-            depth = subtask.get("depth", "detailed")
-            sub_actions = subtask.get("sub_actions", [])
-            expected_output = subtask.get("expected_output", "")
-            quality_checks = subtask.get("quality_checks", [])
-
-            # Assign subsection number (1.1, 1.2, 2.1, 2.2, etc.)
-            subsection_id = f"{section_number}.{subtask_number}"
-
-            # Generate comprehensive analysis with CoT references
-            subtask_result = await analyze_subtask_with_cot_references(
-                subsection_id, task, reasoning, sources_needed, depth, sub_actions,
-                expected_output, quality_checks, context_text, web_context, filename,
-                agent_context, nvidia_rotator, gemini_rotator
-            )
-
-            # If the subtask implies coding, generate code artifacts and explanations
-            if any(kw in (task.lower() + " " + reasoning.lower()) for kw in ["implement", "code", "function", "class", "api", "script", "module", "endpoint"]):
-                try:
-                    logger.info(f"[REPORT] Triggering code generation for {subsection_id}")
-                    code_markdown = await generate_code_artifacts(
-                        subsection_id=subsection_id,
-                        task=task,
-                        reasoning=reasoning,
-                        context_text=context_text,
-                        web_context=web_context,
-                        gemini_rotator=gemini_rotator,
-                        nvidia_rotator=nvidia_rotator
-                    )
-                    # Append code and explanation beneath the analysis
-                    subtask_result = subtask_result + "\n\n" + code_markdown
-                    # Parse structured code for indexing and downstream usage
+
+        async def _process_subtask(subtask, subtask_index):
+            async with semaphore:
+                task = subtask.get("task", "")
+                reasoning = subtask.get("reasoning", "")
+                sources_needed = subtask.get("sources_needed", ["local"])
+                depth = subtask.get("depth", "detailed")
+                sub_actions = subtask.get("sub_actions", [])
+                expected_output = subtask.get("expected_output", "")
+                quality_checks = subtask.get("quality_checks", [])
+                subsection_id = f"{section_number_local}.{subtask_index}"
+                subtask_result = await analyze_subtask_with_cot_references(
+                    subsection_id, task, reasoning, sources_needed, depth, sub_actions,
+                    expected_output, quality_checks, context_text, web_context, filename,
+                    agent_context_shared, nvidia_rotator, gemini_rotator
+                )
+                code_blocks = None
+                if any(kw in (task.lower() + " " + reasoning.lower()) for kw in ["implement", "code", "function", "class", "api", "script", "module", "endpoint"]):
                     try:
-                        code_blocks = extract_structured_code(code_markdown)
-                    except Exception as pe:
-                        logger.warning(f"[REPORT] Failed to parse structured code for {subsection_id}: {pe}")
-                        code_blocks = []
-                except Exception as ce:
-                    logger.warning(f"[REPORT] Code generation failed for {subsection_id}: {ce}")
-
-            # Store agent context for next agents
-            agent_context[f"{section_id}.{subtask_number}"] = {
-                "subsection_id": subsection_id,
-                "task": task,
-                "key_findings": extract_key_findings(subtask_result),
-                "evidence": extract_evidence(subtask_result),
-                "conclusions": extract_conclusions(subtask_result)
-            }
-
-            section_analysis["subtask_results"].append({
-                "subsection_id": subsection_id,
-                "task": task,
-                "reasoning": reasoning,
-                "depth": depth,
-                "sub_actions": sub_actions,
-                "expected_output": expected_output,
-                "quality_checks": quality_checks,
-                "analysis": subtask_result,
-                **({"code_blocks": code_blocks} if 'code_blocks' in locals() else {}),
-                "agent_context": agent_context.copy()
-            })
-
-            subtask_number += 1
-
-        # Generate section-level synthesis with cross-references
+                        logger.info(f"[REPORT] Triggering code generation for {subsection_id}")
+                        code_markdown = await generate_code_artifacts(
+                            subsection_id=subsection_id,
+                            task=task,
+                            reasoning=reasoning,
+                            context_text=context_text,
+                            web_context=web_context,
+                            gemini_rotator=gemini_rotator,
+                            nvidia_rotator=nvidia_rotator
+                        )
+                        subtask_result = subtask_result + "\n\n" + code_markdown
+                        try:
+                            code_blocks = extract_structured_code(code_markdown)
+                        except Exception as pe:
+                            logger.warning(f"[REPORT] Failed to parse structured code for {subsection_id}: {pe}")
+                            code_blocks = []
+                    except Exception as ce:
+                        logger.warning(f"[REPORT] Code generation failed for {subsection_id}: {ce}")
+                agent_context_shared[f"{section_id}.{subtask_index}"] = {
+                    "subsection_id": subsection_id,
+                    "task": task,
+                    "key_findings": extract_key_findings(subtask_result),
+                    "evidence": extract_evidence(subtask_result),
+                    "conclusions": extract_conclusions(subtask_result)
+                }
+                section_analysis["subtask_results"].append({
+                    "subsection_id": subsection_id,
+                    "task": task,
+                    "reasoning": reasoning,
+                    "depth": depth,
+                    "sub_actions": sub_actions,
+                    "expected_output": expected_output,
+                    "quality_checks": quality_checks,
+                    "analysis": subtask_result,
+                    **({"code_blocks": code_blocks} if code_blocks is not None else {}),
+                    "agent_context": agent_context_shared.copy()
+                })
+
+        subtask_tasks = []
+        subtask_index = 1
+        for subtask in section.get("subtasks", []):
+            subtask_tasks.append(_process_subtask(subtask, subtask_index))
+            subtask_index += 1
+        if subtask_tasks:
+            await _asyncio.gather(*subtask_tasks)
+
         section_synthesis = await synthesize_section_with_cot_references(
-            section_analysis, synthesis_strategy, agent_context, nvidia_rotator, gemini_rotator
+            section_analysis, synthesis_strategy, agent_context_shared, nvidia_rotator, gemini_rotator
         )
         section_analysis["section_synthesis"] = section_synthesis
-
-        # Update agent context with section-level insights
-        agent_context[f"section_{section_id}"] = {
+        agent_context_shared[f"section_{section_id}"] = {
             "section_id": section_id,
             "title": section_title,
             "key_insights": extract_key_insights(section_synthesis),
             "cross_references": extract_cross_references(section_synthesis)
         }
-
-        detailed_analysis[section_title] = section_analysis
+        return section_title, section_analysis
+
+    section_tasks = []
+    for section in sections:
+        section_tasks.append(_process_section(section, section_number, agent_context))
         section_number += 1
+    if section_tasks:
+        results = await _asyncio.gather(*section_tasks)
+        for title, analysis in results:
+            detailed_analysis[title] = analysis
+
 
     logger.info(f"[REPORT] Completed hierarchical analysis for {len(detailed_analysis)} sections with CoT references")
     return detailed_analysis
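The reports route now caps fan-out with `asyncio.Semaphore(4)` so that concurrent subtask analyses do not overrun the LLM providers. A minimal sketch of bounding `gather` with a semaphore; `call_provider` is a placeholder for the real provider call:

```python
import asyncio

async def call_provider(i: int) -> str:
    # Stand-in for an LLM request.
    await asyncio.sleep(0.1)
    return f"result {i}"

async def bounded_call(sem: asyncio.Semaphore, i: int) -> str:
    async with sem:  # waits here while 4 calls are already in flight
        return await call_provider(i)

async def main() -> None:
    sem = asyncio.Semaphore(4)  # at most 4 provider calls at once
    results = await asyncio.gather(*(bounded_call(sem, i) for i in range(10)))
    print(results)

asyncio.run(main())
```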
 
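One consequence of running `_process_subtask` concurrently is that `section_analysis["subtask_results"]` is appended to in completion order rather than strict 1.1, 1.2, ... order. If stable ordering matters, a variant (a sketch, not what this commit does) is to return each result and rely on `asyncio.gather` preserving the order of its inputs:

```python
import asyncio
import random

async def process_subtask(subsection_id: str) -> dict:
    # Simulate variable provider latency.
    await asyncio.sleep(random.random() / 10)
    return {"subsection_id": subsection_id, "analysis": f"analysis for {subsection_id}"}

async def process_section(section_number: int, n_subtasks: int) -> list[dict]:
    tasks = [process_subtask(f"{section_number}.{i}") for i in range(1, n_subtasks + 1)]
    # gather() returns results in the order the awaitables were passed in,
    # regardless of which finished first.
    return list(await asyncio.gather(*tasks))

results = asyncio.run(process_section(1, 4))
print([r["subsection_id"] for r in results])  # ['1.1', '1.2', '1.3', '1.4']
```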