LiamKhoaLe committed on
Commit 21446aa · 0 Parent(s)

Init commit
.DS_Store ADDED
Binary file (6.15 kB).
 
.dockerignore ADDED
@@ -0,0 +1,4 @@
+ api/legacy.py
+ *.md
+ .env
+ *yml
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ secrets.toml
.huggingface.yml ADDED
@@ -0,0 +1,4 @@
+ sdk: docker
+ app_file: app.py
+ port: 7860
+ hardware: cpu-basic
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM python:3.11
+
+ # Create and use a non-root user (optional)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy all project files to the container
+ COPY . .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Verify CSV file is present and accessible
+ RUN ls -la /app/utils/symbipredict_2022.csv || echo "CSV file not found"
+
+ # Test CSV loading in Docker environment
+ RUN python /app/test_docker_csv.py
+
+ # Clean up test file
+ RUN rm /app/test_docker_csv.py
+
+ # Set Hugging Face cache directory to persist model downloads
+ ENV HF_HOME="/home/user/.cache/huggingface"
+ ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transformers"
+ ENV MEDGEMMA_HOME="/home/user/.cache/huggingface/sentence-transformers"
+
+ # Create cache directories and ensure permissions
+ RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
+     chown -R user:user /app/model_cache /home/user/.cache/huggingface
+
+ # Pre-load model in a separate script
+ RUN python /app/models/download_model.py && python /app/models/warmup.py
+
+ # Ensure ownership and permissions remain intact
+ RUN chown -R user:user /app/model_cache
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run the application using main.py as entry point
+ CMD ["python", "main.py"]
README.md ADDED
@@ -0,0 +1,116 @@
+ ---
+ title: Cooking Tutor API
+ emoji: 👨‍🍳
+ colorFrom: orange
+ colorTo: red
+ sdk: docker
+ sdk_version: latest
+ pinned: false
+ license: apache-2.0
+ short_description: Cooking Tutor with WebSearch, Memory, Multilingual
+ ---
+
+ # Cooking Tutor Backend
+
+ ## At-a-glance
+ Production-grade cooking assistant with web search integration, conversation memory, multilingual support, and comprehensive recipe guidance.
+
+ ## Key Features
+
+ ### 🔍 Web Search Integration
+ - Curated cooking sources (AllRecipes, Food Network, Epicurious, etc.)
+ - Content extraction and summarization
+ - Citation mapping with clickable URLs
+ - Cooking relevance filtering
+
+ ### 🧠 Memory & Retrieval
+ - Conversation memory with FAISS indexing
+ - Semantic chunking and summarization
+ - Context builder for conversation continuity
+ - Up to 20 recent summaries per user
+
+ ### 🌍 Multilingual Support
+ - Vietnamese and Chinese translation
+ - Language detection and query enhancement
+ - Fallback handling for translation failures
+
+ ### 🍳 Cooking Focus
+ - Specialized cooking keyword filtering
+ - Recipe and technique guidance
+ - Ingredient substitution suggestions
+ - Cooking time and temperature guidance
+
+ ## Usage
+
+ ### Running the Application
+ ```bash
+ # Using the main entry point
+ python main.py
+
+ # Or run the FastAPI app directly
+ python api/app.py
+ ```
+
+ ### Environment Variables
+ - `FlashAPI` - Gemini API key (required)
+ - `NVIDIA_URI` - NVIDIA API key for keyword generation, summarization, and reranking (optional)
+ - `NVIDIA_RERANK_ENDPOINT` - Optional reranker endpoint override
+
+ ## API Endpoints
+
+ ### POST `/chat`
+ Main chat endpoint with cooking guidance.
+
+ **Request Body:**
+ ```json
+ {
+   "query": "How to make perfect pasta?",
+   "lang": "EN",
+   "search": true,
+   "user_id": "unique_user_id",
+   "servings": 4,
+   "dietary": ["vegetarian"],
+   "skill_level": "beginner",
+   "structured": true
+ }
+ ```
+
+ **Response:**
+ ```json
+ {
+   "response": "Cooking guidance with citations <URL>",
+   "response_time": "2.34s"
+ }
+ ```
+
+ ## Search Mode Features
+
+ When `search: true`:
+ 1. Search curated cooking sources
+ 2. Extract and summarize relevant content
+ 3. Filter by cooking relevance
+ 4. Provide citations with clickable URLs
+
+ ## Memory Features
+
+ - **Conversation Continuity**: Maintains context across sessions
+ - **Semantic Chunking**: Groups related cooking topics
+ - **Usage Tracking**: Prioritizes frequently used information
+ - **Time Decay**: Recent conversations get higher priority
+
+ ## Folders Overview
+ - `api/` - FastAPI app, routes, chatbot orchestration
+ - `models/` - Summarizer and processing models
+ - `memory/` - Memory manager and FAISS interfaces
+ - `search/` - Web search engines and processors
+ - `utils/` - Translation and utility functions
+
+ ## Dependencies
+
+ See `requirements.txt` for the complete list. Key components:
+ - `google-genai` - Gemini API integration
+ - `faiss-cpu` - Vector similarity search
+ - `sentence-transformers` - Text embeddings
+ - `transformers` - Translation models
+ - `requests` - Web search functionality
+ - `beautifulsoup4` - HTML content extraction
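For reference, a minimal client call against the documented `/chat` endpoint might look like the sketch below; the base URL and user ID are placeholders, and the payload mirrors the request schema shown in the README above.

```python
# Hypothetical client sketch for POST /chat; host/port and user_id are placeholders.
import requests

payload = {
    "query": "How to make perfect pasta?",
    "lang": "EN",
    "search": True,
    "user_id": "demo_user",
    "servings": 4,
    "dietary": ["vegetarian"],
    "skill_level": "beginner",
    "structured": True,
}

resp = requests.post("http://localhost:7860/chat", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["response"])
```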
api/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # API Module Structure - Cooking Tutor
2
+
3
+ ## 📁 **Module Overview**
4
+
5
+ ### **config.py** - Configuration Management
6
+ - Environment variables validation
7
+ - Logging configuration
8
+ - System resource monitoring
9
+ - Memory optimization settings
10
+ - CORS configuration
11
+
12
+ ### **retrieval.py** - Web Search Integration
13
+ - Cooking information retrieval via web search
14
+ - Recipe suggestion system
15
+ - Smart content filtering and relevance scoring
16
+ - Web search result processing
17
+
18
+ ### **chatbot.py** - Core Chatbot Logic
19
+ - CookingTutorChatbot class
20
+ - Gemini API client
21
+ - Web search integration
22
+ - Citation processing
23
+ - Memory management integration
24
+
25
+ ### **routes.py** - API Endpoints
26
+ - `/chat` - Main chat endpoint
27
+ - `/health` - Health check
28
+ - `/` - Root endpoint with landing page
29
+ - Request/response handling
30
+
31
+ ### **app.py** - Main Application
32
+ - FastAPI app initialization
33
+ - Middleware configuration
34
+ - Route registration
35
+ - Server startup
36
+
37
+ ## 🔄 **Data Flow**
38
+
39
+ ```
40
+ Request → routes.py → chatbot.py → search.py (web search)
41
+
42
+ memory.py (context) + utils/ (translation)
43
+
44
+ models/ (summarization processing)
45
+
46
+ Response with citations
47
+ ```
48
+
49
+ ## 🚀 **Benefits of Modular Structure**
50
+
51
+ 1. **Separation of Concerns**: Each module has a single responsibility
52
+ 2. **Easier Testing**: Individual modules can be tested in isolation
53
+ 3. **Better Maintainability**: Changes to one module don't affect others
54
+ 4. **Improved Readability**: Smaller files are easier to understand
55
+ 5. **Reusability**: Modules can be imported and used elsewhere
56
+ 6. **Scalability**: Easy to add new features without affecting existing code
57
+
58
+ ## 📊 **File Sizes**
59
+
60
+ | File | Lines | Purpose |
61
+ |------|-------|---------|
62
+ | **app.py** | 50 | Main app initialization |
63
+ | **config.py** | 68 | Configuration |
64
+ | **retrieval.py** | 156 | Web search integration |
65
+ | **chatbot.py** | 203 | Chatbot logic |
66
+ | **routes.py** | 435 | API endpoints |
67
+
68
+ ## 🔧 **Usage**
69
+
70
+ The modular structure maintains a clean API interface:
71
+
72
+ ```python
73
+ # All imports work the same way
74
+ from api.app import app
75
+ from api.chatbot import CookingTutorChatbot
76
+ from api.retrieval import retrieval_engine
77
+ ```
78
+
79
+ ## 🛠 **Development Benefits**
80
+
81
+ - **Easier Debugging**: Issues can be isolated to specific modules
82
+ - **Parallel Development**: Multiple developers can work on different modules
83
+ - **Code Reviews**: Smaller files are easier to review
84
+ - **Documentation**: Each module can have focused documentation
85
+ - **Testing**: Unit tests can be written for each module independently
api/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # API package
+ # Main API endpoints and routes
api/app.py ADDED
@@ -0,0 +1,49 @@
1
+ # api/app.py
2
+ import uvicorn
3
+ from fastapi import FastAPI
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from .config import setup_logging, check_system_resources, optimize_memory, CORS_ORIGINS, validate_environment
6
+ from .routes import router
7
+
8
+ # ✅ Validate environment
9
+ validate_environment()
10
+
11
+ # ✅ Setup logging
12
+ logger = setup_logging()
13
+ logger.info("🍳 Starting Cooking Tutor API...")
14
+
15
+ # ✅ Monitor system resources
16
+ check_system_resources(logger)
17
+
18
+ # ✅ Optimize memory usage
19
+ optimize_memory()
20
+
21
+ # ✅ Initialize FastAPI app
22
+ app = FastAPI(
23
+ title="Cooking Tutor API",
24
+ description="AI-powered cooking lesson and recipe tutoring with web search",
25
+ version="1.0.0"
26
+ )
27
+
28
+ # ✅ Add CORS middleware
29
+ app.add_middleware(
30
+ CORSMiddleware,
31
+ allow_origins=CORS_ORIGINS,
32
+ allow_credentials=True,
33
+ allow_methods=["*"],
34
+ allow_headers=["*"],
35
+ )
36
+
37
+ # No database initialization required for cooking tutor (web-search only)
38
+
39
+ # ✅ Include routes
40
+ app.include_router(router)
41
+
42
+ # ✅ Run Uvicorn
43
+ if __name__ == "__main__":
44
+ logger.info("[System] ✅ Starting FastAPI Server...")
45
+ try:
46
+ uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
47
+ except Exception as e:
48
+ logger.error(f"❌ Server Startup Failed: {e}")
49
+ exit(1)
api/chatbot.py ADDED
@@ -0,0 +1,202 @@
1
+ # api/chatbot.py
2
+ import re
3
+ import logging
4
+ from typing import Dict
5
+ from google import genai
6
+ from .config import gemini_flash_api_key
7
+ from memory import MemoryManager
8
+ from utils import translate_query
9
+ from search import search_comprehensive
10
+ # Safety guard removed - cooking tutor doesn't need medical safety checks
11
+
12
+ logger = logging.getLogger("cooking-tutor")
13
+
14
+ class GeminiClient:
15
+ """Gemini API client for generating responses"""
16
+
17
+ def __init__(self):
18
+ self.client = genai.Client(api_key=gemini_flash_api_key)
19
+
20
+ def generate_content(self, prompt: str, model: str = "gemini-2.5-flash", temperature: float = 0.7) -> str:
21
+ """Generate content using Gemini API"""
22
+ try:
23
+ response = self.client.models.generate_content(model=model, contents=prompt)
24
+ return response.text
25
+ except Exception as e:
26
+ logger.error(f"[LLM] ❌ Error calling Gemini API: {e}")
27
+ return "Error generating response from Gemini."
28
+
29
+ class CookingTutorChatbot:
30
+ """Cooking tutor chatbot that uses only web search + memory."""
31
+
32
+ def __init__(self, model_name: str):
33
+ self.model_name = model_name
34
+ self.gemini_client = GeminiClient()
35
+ self.memory = MemoryManager()
36
+
37
+ def chat(
38
+ self,
39
+ user_id: str,
40
+ user_query: str,
41
+ lang: str = "EN",
42
+ search_mode: bool = True,
43
+ video_mode: bool = False,
44
+ servings: int = None,
45
+ dietary: list = None,
46
+ allergens: list = None,
47
+ equipment: list = None,
48
+ time_limit_minutes: int = None,
49
+ skill_level: str = None,
50
+ cuisine: str = None,
51
+ structured: bool = False,
52
+ ) -> str:
53
+ # Translate to English-centric search if needed
54
+ if lang.upper() in {"VI", "ZH"}:
55
+ user_query = translate_query(user_query, lang.lower())
56
+
57
+ # Basic cooking relevance check
58
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
59
+ query_lower = user_query.lower()
60
+ if not any(keyword in query_lower for keyword in cooking_keywords):
61
+ logger.warning(f"[SAFETY] Non-cooking query detected: {user_query}")
62
+ return "⚠️ I'm a cooking tutor! Please ask me about recipes, cooking techniques, ingredients, or anything food-related."
63
+
64
+ # Conversation memory (recent turns)
65
+ contextual_chunks = self.memory.get_contextual_chunks(user_id, user_query, lang)
66
+
67
+ # Web search context
68
+ search_context = ""
69
+ url_mapping = {}
70
+ source_aggregation = {}
71
+ video_results = []
72
+
73
+ if search_mode:
74
+ try:
75
+ search_context, url_mapping, source_aggregation = search_comprehensive(
76
+ f"cooking technique tutorial: {user_query}",
77
+ num_results=12,
78
+ target_language=lang,
79
+ include_videos=bool(video_mode)
80
+ )
81
+ if video_mode and source_aggregation:
82
+ video_results = source_aggregation.get('sources', []) or []
83
+ except Exception as e:
84
+ logger.error(f"[SEARCH] Failed: {e}")
85
+
86
+ # Build prompt
87
+ parts = [
88
+ "You are a professional cooking tutor and recipe coach.",
89
+ "Provide step-by-step, practical instructions with exact measurements, temperatures, and timings.",
90
+ "Offer substitutions, variations, pantry-friendly swaps, and troubleshooting tips.",
91
+ "Adapt guidance to different skill levels (beginner/intermediate/advanced).",
92
+ "Use Markdown with headings, numbered steps, bullet lists, and short paragraphs.",
93
+ "Always include a concise Ingredients list when relevant.",
94
+ "Cite sources inline using <#ID> tags already present in the search context when applicable.",
95
+ ]
96
+
97
+ # Constraints block
98
+ constraints = []
99
+ if servings:
100
+ constraints.append(f"Servings: {servings}")
101
+ if dietary:
102
+ constraints.append(f"Dietary preferences: {', '.join(dietary)}")
103
+ if allergens:
104
+ constraints.append(f"Avoid allergens: {', '.join(allergens)}")
105
+ if equipment:
106
+ constraints.append(f"Available equipment: {', '.join(equipment)}")
107
+ if time_limit_minutes:
108
+ constraints.append(f"Time limit: {time_limit_minutes} minutes")
109
+ if skill_level:
110
+ constraints.append(f"Skill level: {skill_level}")
111
+ if cuisine:
112
+ constraints.append(f"Cuisine: {cuisine}")
113
+
114
+ if constraints:
115
+ parts.append("Constraints to respect:\n- " + "\n- ".join(constraints))
116
+
117
+ if contextual_chunks:
118
+ parts.append("Relevant context from previous messages:\n" + contextual_chunks)
119
+ if search_context:
120
+ parts.append("Cooking knowledge from the web (with citations):\n" + search_context)
121
+
122
+ parts.append(f"User's cooking question: {user_query}")
123
+ parts.append(f"Language to generate answer: {lang}")
124
+
125
+ if structured:
126
+ parts.append(
127
+ "Return a Markdown response with these sections if relevant:"
128
+ "\n1. Title"
129
+ "\n2. Summary (2-3 sentences)"
130
+ "\n3. Ingredients (quantities in metric and US units)"
131
+ "\n4. Equipment"
132
+ "\n5. Step-by-step Instructions (numbered)"
133
+ "\n6. Timing & Temperatures"
134
+ "\n7. Variations & Substitutions"
135
+ "\n8. Troubleshooting & Doneness Cues"
136
+ "\n9. Storage & Reheating"
137
+ "\n10. Sources"
138
+ )
139
+
140
+ prompt = "\n\n".join(parts)
141
+ response = self.gemini_client.generate_content(prompt, model=self.model_name, temperature=0.6)
142
+
143
+ # Process citations
144
+ if url_mapping:
145
+ response = self._process_citations(response, url_mapping)
146
+
147
+ # Basic cooking relevance check for response
148
+ if response and len(response) > 50:
149
+ response_lower = response.lower()
150
+ if not any(keyword in response_lower for keyword in cooking_keywords):
151
+ logger.warning(f"[SAFETY] Non-cooking response detected, redirecting to cooking topic")
152
+ response = "⚠️ Let's stick to cooking-related topics. Try asking about recipes, techniques, or ingredients!"
153
+
154
+ if user_id:
155
+ self.memory.add_exchange(user_id, user_query, response, lang=lang)
156
+
157
+ if video_mode and video_results:
158
+ return {
159
+ 'text': response.strip(),
160
+ 'videos': video_results
161
+ }
162
+ return response.strip()
163
+
164
+ def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
165
+ """Replace citation tags with actual URLs, handling both single and multiple references"""
166
+
167
+ # Pattern to match both single citations <#1> and multiple citations <#1, #2, #5, #7, #9>
168
+ citation_pattern = r'<#([^>]+)>'
169
+
170
+ def replace_citation(match):
171
+ citation_content = match.group(1)
172
+ # Split by comma and clean up each citation ID
173
+ citation_ids = [id_str.strip() for id_str in citation_content.split(',')]
174
+
175
+ urls = []
176
+ for citation_id in citation_ids:
177
+ try:
178
+ doc_id = int(citation_id)
179
+ if doc_id in url_mapping:
180
+ url = url_mapping[doc_id]
181
+ urls.append(f'<{url}>')
182
+ logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
183
+ else:
184
+ logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
185
+ urls.append(f'<#{doc_id}>') # Keep original if URL not found
186
+ except ValueError:
187
+ logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
188
+ urls.append(f'<#{citation_id}>') # Keep original if invalid
189
+
190
+ # Join multiple URLs with spaces
191
+ return ' '.join(urls)
192
+
193
+ # Replace citations with URLs
194
+ processed_response = re.sub(citation_pattern, replace_citation, response)
195
+
196
+ # Count total citations processed
197
+ citations_found = re.findall(citation_pattern, response)
198
+ total_citations = sum(len([id_str.strip() for id_str in citation_content.split(',')])
199
+ for citation_content in citations_found)
200
+
201
+ logger.info(f"[CITATION] Processed {total_citations} citations from {len(citations_found)} citation groups, {len(url_mapping)} URL mappings available")
202
+ return processed_response
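To illustrate how `_process_citations` rewrites citation tags, here is a small standalone sketch of the same regex logic applied to a made-up response and URL mapping; it adds one small tweak (stripping a leading `#` from secondary IDs in a group) that is not in the method above.

```python
# Standalone sketch of the citation-rewriting logic; sample text and URLs are illustrative.
import re

url_mapping = {1: "https://example.com/pasta-guide", 2: "https://example.com/salting-water"}
response = "Salt the water generously <#1, #2> and cook until al dente <#1>."

def replace_citation(match: re.Match) -> str:
    ids = [s.strip().lstrip("#") for s in match.group(1).split(",")]  # "1, #2" -> ["1", "2"]
    urls = []
    for raw in ids:
        try:
            doc_id = int(raw)
            urls.append(f"<{url_mapping[doc_id]}>" if doc_id in url_mapping else f"<#{doc_id}>")
        except ValueError:
            urls.append(f"<#{raw}>")  # keep the original tag if the ID is not numeric
    return " ".join(urls)

print(re.sub(r"<#([^>]+)>", replace_citation, response))
# -> "Salt the water generously <https://example.com/pasta-guide> <https://example.com/salting-water> ..."
```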
api/config.py ADDED
@@ -0,0 +1,67 @@
1
+ # api/config.py
2
+ import os
3
+ import logging
4
+ import psutil
5
+ from typing import List
6
+
7
+ # ✅ Environment Variables
8
+ gemini_flash_api_key = os.getenv("FlashAPI")
9
+
10
+ # Validate environment endpoint (only when actually running the app)
11
+ def validate_environment():
12
+ if not gemini_flash_api_key:
13
+ raise ValueError("❌ Missing FlashAPI key for Gemini. Set env var FlashAPI.")
14
+
15
+ # ✅ Logging Configuration
16
+ def setup_logging():
17
+ """Configure logging for the application"""
18
+ # Silence noisy loggers
19
+ for name in [
20
+ "uvicorn.error", "uvicorn.access",
21
+ "fastapi", "starlette",
22
+ "pymongo", "gridfs",
23
+ "sentence_transformers", "faiss",
24
+ "google", "google.auth",
25
+ ]:
26
+ logging.getLogger(name).setLevel(logging.WARNING)
27
+
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format="%(asctime)s — %(name)s — %(levelname)s — %(message)s",
31
+ force=True
32
+ )
33
+
34
+ logger = logging.getLogger("cooking-tutor")
35
+ logger.setLevel(logging.DEBUG)
36
+ return logger
37
+
38
+ # ✅ System Resource Monitoring
39
+ def check_system_resources(logger):
40
+ """Monitor system resources and log warnings"""
41
+ memory = psutil.virtual_memory()
42
+ cpu = psutil.cpu_percent(interval=1)
43
+ disk = psutil.disk_usage("/")
44
+
45
+ logger.info(f"[System] 🔍 System Resources - RAM: {memory.percent}%, CPU: {cpu}%, Disk: {disk.percent}%")
46
+
47
+ if memory.percent > 85:
48
+ logger.warning("⚠️ High RAM usage detected!")
49
+ if cpu > 90:
50
+ logger.warning("⚠️ High CPU usage detected!")
51
+ if disk.percent > 90:
52
+ logger.warning("⚠️ High Disk usage detected!")
53
+
54
+ # ✅ Memory Optimization
55
+ def optimize_memory():
56
+ """Set environment variables for memory optimization"""
57
+ os.environ["OMP_NUM_THREADS"] = "1"
58
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
59
+
60
+ # ✅ CORS Configuration
61
+ CORS_ORIGINS = [
62
+ "http://localhost:5173", # Vite dev server
63
+ "http://localhost:3000", # Another vercel local dev
64
+ "https://cooking-tutor.vercel.app", # ✅ Vercel frontend production URL
65
+ ]
66
+
67
+ # No embedding/RAG models used in cooking tutor
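A minimal local sanity check of these configuration helpers might look like the sketch below, assuming `FlashAPI` is exported in the shell before the module is imported (the key is read at import time).

```python
# Hypothetical local check of api/config.py helpers; run from the project root
# with the FlashAPI environment variable already set in the shell.
from api.config import setup_logging, validate_environment, check_system_resources, optimize_memory

validate_environment()          # raises if FlashAPI is missing
logger = setup_logging()        # configures app-wide logging and silences noisy libraries
optimize_memory()               # caps thread usage for small containers
check_system_resources(logger)  # logs RAM/CPU/disk and warns on pressure
```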
api/retrieval.py ADDED
@@ -0,0 +1,155 @@
1
+ # api/retrieval.py
2
+ import os
3
+ import re
4
+ import time
5
+ import requests
6
+ import numpy as np
7
+ import logging
8
+ from typing import List, Dict
9
+ # Database removed - cooking tutor uses web search only
10
+ from models import summarizer
11
+
12
+ logger = logging.getLogger("retrieval-bot")
13
+
14
+ class RetrievalEngine:
15
+ def __init__(self):
16
+ # Database removed - cooking tutor uses web search only
17
+ self._reranker = None
18
+
19
+ def _get_reranker(self):
20
+ """Initialize the NVIDIA reranker on first use."""
21
+ if self._reranker is None:
22
+ self._reranker = _NvidiaReranker()
23
+ return self._reranker
24
+
25
+ @staticmethod
26
+ def _is_cooking_guide_text(text: str) -> bool:
27
+ """Heuristic to detect cooking guide content."""
28
+ if not text:
29
+ return False
30
+ keywords = [
31
+ # common cooking guide indicators
32
+ r"\bguideline(s)?\b", r"\bcooking practice\b", r"\brecommend(ation|ed|s)?\b",
33
+ r"\bshould\b", r"\bmust\b", r"\bstrongly (recommend|suggest)\b",
34
+ r"\brecipe\b", r"\btechnique\b", r"\bmethod\b", r"\binstruction\b",
35
+ r"\btemperature\b", r"\btiming\b", r"\bmeasurement\b"
36
+ ]
37
+ text_lc = text.lower()
38
+ return any(re.search(p, text_lc, flags=re.IGNORECASE) for p in keywords)
39
+
40
+ @staticmethod
41
+ def _extract_cooking_guide_sentences(text: str) -> str:
42
+ """Extract likely cooking guide sentences to reduce conversational/noisy content before summarization."""
43
+ if not text:
44
+ return ""
45
+ sentences = re.split(r"(?<=[.!?])\s+", text)
46
+ keep_patterns = [
47
+ r"\b(recommend|should|must|preferred|first-choice|consider)\b",
48
+ r"\b(temperature|timing|measurement|portion|serving)\b",
49
+ r"\b(ingredient|seasoning|spice|herb|sauce|marinade)\b",
50
+ r"\b(prepare|cook|bake|roast|grill|fry|boil|steam)\b"
51
+ ]
52
+ kept = []
53
+ for s in sentences:
54
+ s_norm = s.strip()
55
+ if not s_norm:
56
+ continue
57
+ if any(re.search(p, s_norm, flags=re.IGNORECASE) for p in keep_patterns):
58
+ kept.append(s_norm)
59
+ # Fallback: if filtering too aggressive, keep truncated original
60
+ if not kept:
61
+ return text[:1200]
62
+ return " ".join(kept)[:2000]
63
+
64
+ def retrieve_cooking_info(self, query: str, k: int = 5, min_sim: float = 0.8) -> list:
65
+ """
66
+ Retrieve cooking information - placeholder for web search integration
67
+ """
68
+ # This method is kept for compatibility but cooking tutor uses web search
69
+ logger.info(f"[Retrieval] Cooking info retrieval requested for: {query}")
70
+ return [""]
71
+
72
+ def retrieve_recipe_suggestions(self, ingredient_text: str, top_k: int = 5, min_sim: float = 0.5) -> list:
73
+ """
74
+ Retrieve recipe suggestions from ingredients - placeholder for web search integration
75
+ """
76
+ # This method is kept for compatibility but cooking tutor uses web search
77
+ logger.info(f"[Retrieval] Recipe suggestions requested for ingredients: {ingredient_text}")
78
+ return [""]
79
+
80
+ # Global retrieval engine instance
81
+ retrieval_engine = RetrievalEngine()
82
+
83
+
84
+ class _NvidiaReranker:
85
+ """Simple client for NVIDIA NIM reranking: nvidia/rerank-qa-mistral-4b"""
86
+ def __init__(self):
87
+ self.api_key = os.getenv("NVIDIA_URI")
88
+ # Use provider doc model identifier
89
+ self.model = os.getenv("NVIDIA_RERANK_MODEL", "nv-rerank-qa-mistral-4b:1")
90
+ # NIM rerank endpoint (subject to environment); keep configurable
91
+ self.base_url = os.getenv("NVIDIA_RERANK_ENDPOINT", "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking")
92
+ self.timeout_s = 30
93
+
94
+ def rerank(self, query: str, documents: List[str]) -> List[Dict]:
95
+ if not self.api_key:
96
+ raise ValueError("NVIDIA_URI not set for reranker")
97
+ if not documents:
98
+ return []
99
+ headers = {
100
+ "Authorization": f"Bearer {self.api_key}",
101
+ "Content-Type": "application/json",
102
+ "Accept": "application/json",
103
+ }
104
+ # Truncate and limit candidates to avoid 4xx
105
+ docs = documents[:10]
106
+ docs = [d[:2000] for d in docs if isinstance(d, str)]
107
+ # Two payload shapes based on provider doc
108
+ payloads = [
109
+ {
110
+ "model": self.model,
111
+ "query": {"text": query},
112
+ "passages": [{"text": d} for d in docs],
113
+ },
114
+ {
115
+ "model": self.model,
116
+ "query": query,
117
+ "documents": [{"text": d} for d in docs],
118
+ },
119
+ ]
120
+ try:
121
+ data = None
122
+ for p in payloads:
123
+ resp = requests.post(self.base_url, headers=headers, json=p, timeout=self.timeout_s)
124
+ if resp.status_code >= 400:
125
+ # try next shape
126
+ continue
127
+ data = resp.json()
128
+ break
129
+ if data is None:
130
+ # last attempt for diagnostics
131
+ resp.raise_for_status()
132
+ # Expecting a list with scores and indices or texts
133
+ results = []
134
+ entries = data.get("results") or data.get("data") or []
135
+ if isinstance(entries, list) and entries:
136
+ for entry in entries:
137
+ # Common patterns: {index, score} or {text, score}
138
+ idx = entry.get("index")
139
+ text = entry.get("text") if entry.get("text") else (documents[idx] if idx is not None and idx < len(documents) else None)
140
+ score = entry.get("score", 0)
141
+ if text:
142
+ results.append({"text": text, "score": float(score)})
143
+ else:
144
+ # Fallback: if API returns scores aligned to input order
145
+ scores = data.get("scores")
146
+ if isinstance(scores, list) and len(scores) == len(documents):
147
+ for t, s in zip(documents, scores):
148
+ results.append({"text": t, "score": float(s)})
149
+ # Sort by score desc
150
+ results.sort(key=lambda x: x.get("score", 0), reverse=True)
151
+ return results
152
+ except Exception as e:
153
+ logger.warning(f"[Reranker] Failed calling NVIDIA reranker: {e}")
154
+ # On failure, return original order with neutral scores
155
+ return [{"text": d, "score": 0.0} for d in documents]
api/routes.py ADDED
@@ -0,0 +1,434 @@
1
+ # api/routes.py
2
+ import time
3
+ import os
4
+ import re
5
+ import json
6
+ import logging
7
+ import uuid
8
+ from datetime import datetime, timedelta
9
+ from fastapi import APIRouter, Request
10
+ from fastapi.responses import JSONResponse, HTMLResponse
11
+ from .chatbot import CookingTutorChatbot
12
+
13
+ logger = logging.getLogger("routes")
14
+
15
+ # Create router
16
+ router = APIRouter()
17
+
18
+ # Initialize cooking tutor chatbot
19
+ chatbot = CookingTutorChatbot(
20
+ model_name="gemini-2.5-flash"
21
+ )
22
+
23
+ @router.post("/chat")
24
+ async def chat_endpoint(req: Request):
25
+ """Chat endpoint (web-search only). No DB persistence, no image handling."""
26
+ body = await req.json()
27
+ user_id = body.get("user_id", "anonymous")
28
+ query_raw = body.get("query")
29
+ query = query_raw.strip() if isinstance(query_raw, str) else ""
30
+ lang = body.get("lang", "EN")
31
+ search_mode = body.get("search", True)
32
+ video_mode = body.get("video", False)
33
+ # Optional cooking constraints
34
+ servings = body.get("servings")
35
+ dietary = body.get("dietary") # e.g., ["vegetarian", "gluten-free"]
36
+ allergens = body.get("allergens") # e.g., ["peanuts", "shellfish"]
37
+ equipment = body.get("equipment") # e.g., ["oven", "cast iron skillet"]
38
+ time_limit = body.get("time_limit_minutes") # e.g., 30
39
+ skill_level = body.get("skill_level") # beginner|intermediate|advanced
40
+ cuisine = body.get("cuisine") # e.g., "Italian"
41
+ structured = body.get("structured", False)
42
+
43
+ start = time.time()
44
+ try:
45
+ answer = chatbot.chat(
46
+ user_id,
47
+ query,
48
+ lang,
49
+ search_mode,
50
+ video_mode,
51
+ servings=servings,
52
+ dietary=dietary,
53
+ allergens=allergens,
54
+ equipment=equipment,
55
+ time_limit_minutes=time_limit,
56
+ skill_level=skill_level,
57
+ cuisine=cuisine,
58
+ structured=structured,
59
+ )
60
+ elapsed = time.time() - start
61
+
62
+ # Handle response format (might be string or dict with videos)
63
+ if isinstance(answer, dict):
64
+ response_text = answer.get('text', '')
65
+ video_data = answer.get('videos', [])
66
+ else:
67
+ response_text = answer
68
+ video_data = []
69
+
70
+ # Final response
71
+ response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
72
+
73
+ # Include video data if available
74
+ if video_data:
75
+ response_data["videos"] = video_data
76
+
77
+ return JSONResponse(response_data)
78
+
79
+ except Exception as e:
80
+ logger.error(f"[REQUEST] Error processing request: {e}")
81
+ return JSONResponse({"response": "❌ Failed to get a response. Please try again."})
82
+
83
+ @router.get("/check-request/{request_id}")
84
+ async def check_request_status(request_id: str):
85
+ """Legacy endpoint kept for compatibility; returns not supported."""
86
+ return JSONResponse({"status": "unsupported"})
87
+
88
+ @router.get("/pending-requests/{user_id}")
89
+ async def get_pending_requests(user_id: str):
90
+ """Legacy endpoint kept for compatibility; returns empty list."""
91
+ return JSONResponse({"requests": []})
92
+
93
+ @router.delete("/cleanup-requests")
94
+ async def cleanup_old_requests():
95
+ """Legacy endpoint kept for compatibility; no-op."""
96
+ return JSONResponse({"deleted_count": 0})
97
+
98
+ @router.get("/health")
99
+ async def health_check():
100
+ """Health check endpoint"""
101
+ return {"status": "healthy", "service": "cooking-tutor"}
102
+
103
+ @router.get("/")
104
+ async def root():
105
+ """Root endpoint - Landing page with redirect to main app"""
106
+
107
+ html_content = """
108
+ <!DOCTYPE html>
109
+ <html lang="en">
110
+ <head>
111
+ <meta charset="UTF-8">
112
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
113
+ <title>Cooking Tutor API</title>
114
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
115
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
116
+ <style>
117
+ * {
118
+ margin: 0;
119
+ padding: 0;
120
+ box-sizing: border-box;
121
+ }
122
+
123
+ body {
124
+ font-family: 'Inter', sans-serif;
125
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
126
+ min-height: 100vh;
127
+ display: flex;
128
+ align-items: center;
129
+ justify-content: center;
130
+ overflow: hidden;
131
+ position: relative;
132
+ }
133
+
134
+ /* Animated background particles */
135
+ .particles {
136
+ position: absolute;
137
+ top: 0;
138
+ left: 0;
139
+ width: 100%;
140
+ height: 100%;
141
+ overflow: hidden;
142
+ z-index: 1;
143
+ }
144
+
145
+ .particle {
146
+ position: absolute;
147
+ background: rgba(255, 255, 255, 0.1);
148
+ border-radius: 50%;
149
+ animation: float 6s ease-in-out infinite;
150
+ }
151
+
152
+ .particle:nth-child(1) { width: 80px; height: 80px; top: 20%; left: 10%; animation-delay: 0s; }
153
+ .particle:nth-child(2) { width: 120px; height: 120px; top: 60%; left: 80%; animation-delay: 2s; }
154
+ .particle:nth-child(3) { width: 60px; height: 60px; top: 80%; left: 20%; animation-delay: 4s; }
155
+ .particle:nth-child(4) { width: 100px; height: 100px; top: 10%; left: 70%; animation-delay: 1s; }
156
+ .particle:nth-child(5) { width: 90px; height: 90px; top: 40%; left: 50%; animation-delay: 3s; }
157
+
158
+ @keyframes float {
159
+ 0%, 100% { transform: translateY(0px) rotate(0deg); opacity: 0.7; }
160
+ 50% { transform: translateY(-20px) rotate(180deg); opacity: 1; }
161
+ }
162
+
163
+ .container {
164
+ background: rgba(255, 255, 255, 0.1);
165
+ backdrop-filter: blur(20px);
166
+ border: 1px solid rgba(255, 255, 255, 0.2);
167
+ border-radius: 24px;
168
+ padding: 3rem 2rem;
169
+ text-align: center;
170
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
171
+ max-width: 500px;
172
+ width: 90%;
173
+ position: relative;
174
+ z-index: 2;
175
+ animation: slideUp 0.8s ease-out;
176
+ }
177
+
178
+ @keyframes slideUp {
179
+ from {
180
+ opacity: 0;
181
+ transform: translateY(50px);
182
+ }
183
+ to {
184
+ opacity: 1;
185
+ transform: translateY(0);
186
+ }
187
+ }
188
+
189
+ .logo {
190
+ width: 80px;
191
+ height: 80px;
192
+ background: linear-gradient(135deg, #f59e0b 0%, #ef4444 100%);
193
+ border-radius: 20px;
194
+ display: flex;
195
+ align-items: center;
196
+ justify-content: center;
197
+ margin: 0 auto 1.5rem;
198
+ animation: pulse 2s ease-in-out infinite;
199
+ }
200
+
201
+ @keyframes pulse {
202
+ 0%, 100% { transform: scale(1); }
203
+ 50% { transform: scale(1.05); }
204
+ }
205
+
206
+ .logo i {
207
+ font-size: 2rem;
208
+ color: white;
209
+ }
210
+
211
+ h1 {
212
+ color: white;
213
+ font-size: 2.5rem;
214
+ font-weight: 700;
215
+ margin-bottom: 0.5rem;
216
+ background: linear-gradient(135deg, #ffffff 0%, #f0f9ff 100%);
217
+ -webkit-background-clip: text;
218
+ -webkit-text-fill-color: transparent;
219
+ background-clip: text;
220
+ }
221
+
222
+ .subtitle {
223
+ color: rgba(255, 255, 255, 0.8);
224
+ font-size: 1.1rem;
225
+ margin-bottom: 2rem;
226
+ font-weight: 400;
227
+ }
228
+
229
+ .version {
230
+ color: rgba(255, 255, 255, 0.6);
231
+ font-size: 0.9rem;
232
+ margin-bottom: 2rem;
233
+ font-weight: 300;
234
+ }
235
+
236
+ .redirect-btn {
237
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
238
+ color: white;
239
+ border: none;
240
+ padding: 1rem 2rem;
241
+ border-radius: 12px;
242
+ font-size: 1.1rem;
243
+ font-weight: 600;
244
+ cursor: pointer;
245
+ transition: all 0.3s ease;
246
+ text-decoration: none;
247
+ display: inline-flex;
248
+ align-items: center;
249
+ gap: 0.5rem;
250
+ box-shadow: 0 8px 20px rgba(102, 126, 234, 0.3);
251
+ position: relative;
252
+ overflow: hidden;
253
+ }
254
+
255
+ .redirect-btn::before {
256
+ content: '';
257
+ position: absolute;
258
+ top: 0;
259
+ left: -100%;
260
+ width: 100%;
261
+ height: 100%;
262
+ background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
263
+ transition: left 0.5s;
264
+ }
265
+
266
+ .redirect-btn:hover::before {
267
+ left: 100%;
268
+ }
269
+
270
+ .redirect-btn:hover {
271
+ transform: translateY(-2px);
272
+ box-shadow: 0 12px 30px rgba(102, 126, 234, 0.4);
273
+ }
274
+
275
+ .redirect-btn:active {
276
+ transform: translateY(0);
277
+ }
278
+
279
+ .redirect-btn i {
280
+ font-size: 1.2rem;
281
+ transition: transform 0.3s ease;
282
+ }
283
+
284
+ .redirect-btn:hover i {
285
+ transform: translateX(3px);
286
+ }
287
+
288
+ .features {
289
+ margin-top: 2rem;
290
+ display: grid;
291
+ grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
292
+ gap: 1rem;
293
+ }
294
+
295
+ .feature {
296
+ color: rgba(255, 255, 255, 0.7);
297
+ font-size: 0.9rem;
298
+ font-weight: 500;
299
+ }
300
+
301
+ .feature i {
302
+ display: block;
303
+ font-size: 1.5rem;
304
+ margin-bottom: 0.5rem;
305
+ color: rgba(255, 255, 255, 0.9);
306
+ }
307
+
308
+ @media (max-width: 768px) {
309
+ .container {
310
+ padding: 2rem 1.5rem;
311
+ margin: 1rem;
312
+ }
313
+
314
+ h1 {
315
+ font-size: 2rem;
316
+ }
317
+
318
+ .subtitle {
319
+ font-size: 1rem;
320
+ }
321
+
322
+ .redirect-btn {
323
+ padding: 0.8rem 1.5rem;
324
+ font-size: 1rem;
325
+ }
326
+ }
327
+ </style>
328
+ </head>
329
+ <body>
330
+ <div class="particles">
331
+ <div class="particle"></div>
332
+ <div class="particle"></div>
333
+ <div class="particle"></div>
334
+ <div class="particle"></div>
335
+ <div class="particle"></div>
336
+ </div>
337
+
338
+ <div class="container">
339
+ <div class="logo">
340
+ <i class="fas fa-utensils"></i>
341
+ </div>
342
+
343
+ <h1>Cooking Tutor</h1>
344
+ <p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
345
+ <p class="version">API Version 1.0.0</p>
346
+
347
+ <a href="/" class="redirect-btn" target="_blank">
348
+ <i class="fas fa-external-link-alt"></i>
349
+ Open Frontend
350
+ </a>
351
+
352
+ <div class="features">
353
+ <div class="feature">
354
+ <i class="fas fa-seedling"></i>
355
+ Friendly
356
+ </div>
357
+ <div class="feature">
358
+ <i class="fas fa-list-ol"></i>
359
+ Step-by-step
360
+ </div>
361
+ <div class="feature">
362
+ <i class="fas fa-globe"></i>
363
+ Multi-Language
364
+ </div>
365
+ </div>
366
+ </div>
367
+
368
+ <script>
369
+ // Add some interactive effects
370
+ document.addEventListener('DOMContentLoaded', function() {
371
+ const btn = document.querySelector('.redirect-btn');
372
+ const particles = document.querySelectorAll('.particle');
373
+
374
+ // Add click animation
375
+ btn.addEventListener('click', function(e) {
376
+ // Create ripple effect
377
+ const ripple = document.createElement('span');
378
+ const rect = this.getBoundingClientRect();
379
+ const size = Math.max(rect.width, rect.height);
380
+ const x = e.clientX - rect.left - size / 2;
381
+ const y = e.clientY - rect.top - size / 2;
382
+
383
+ ripple.style.cssText = `
384
+ position: absolute;
385
+ width: ${size}px;
386
+ height: ${size}px;
387
+ left: ${x}px;
388
+ top: ${y}px;
389
+ background: rgba(255, 255, 255, 0.3);
390
+ border-radius: 50%;
391
+ transform: scale(0);
392
+ animation: ripple 0.6s ease-out;
393
+ pointer-events: none;
394
+ `;
395
+
396
+ this.appendChild(ripple);
397
+
398
+ setTimeout(() => {
399
+ ripple.remove();
400
+ }, 600);
401
+ });
402
+
403
+ // Add CSS for ripple animation
404
+ const style = document.createElement('style');
405
+ style.textContent = `
406
+ @keyframes ripple {
407
+ to {
408
+ transform: scale(2);
409
+ opacity: 0;
410
+ }
411
+ }
412
+ `;
413
+ document.head.appendChild(style);
414
+
415
+ // Animate particles on mouse move
416
+ document.addEventListener('mousemove', function(e) {
417
+ const x = e.clientX / window.innerWidth;
418
+ const y = e.clientY / window.innerHeight;
419
+
420
+ particles.forEach((particle, index) => {
421
+ const speed = (index + 1) * 0.5;
422
+ const xOffset = (x - 0.5) * speed * 20;
423
+ const yOffset = (y - 0.5) * speed * 20;
424
+
425
+ particle.style.transform = `translate(${xOffset}px, ${yOffset}px)`;
426
+ });
427
+ });
428
+ });
429
+ </script>
430
+ </body>
431
+ </html>
432
+ """
433
+
434
+ return HTMLResponse(content=html_content)
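These routes can be smoke-tested in-process with FastAPI's TestClient; the sketch below assumes the environment the app needs at import time is available (notably `FlashAPI` and the local model cache) and simply checks `/health` plus one `/chat` round trip.

```python
# In-process smoke-test sketch using FastAPI's TestClient.
# Assumes FlashAPI and the embedding model cache are available at import time.
from fastapi.testclient import TestClient
from api.app import app

client = TestClient(app)

assert client.get("/health").json()["status"] == "healthy"

resp = client.post("/chat", json={
    "query": "What is the best way to boil pasta?",
    "lang": "EN",
    "search": False,          # skip web search for a faster local check
    "user_id": "smoke-test",
})
print(resp.json()["response"][:200])
```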
main.py ADDED
@@ -0,0 +1,13 @@
+ # main.py - Entry point for the Cooking Tutor API
+ import uvicorn
+ from api.app import app
+
+ if __name__ == "__main__":
+     print("🍳 Starting Cooking Tutor API...")
+     uvicorn.run(
+         app,
+         host="0.0.0.0",
+         port=7860,
+         log_level="info",
+         reload=False  # Set to True for development
+     )
memory/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Memory package
+ from .memory import MemoryManager
memory/memory.py ADDED
@@ -0,0 +1,331 @@
1
+ # memory/memory.py
2
+ import re, time, hashlib, asyncio, os
3
+ from collections import defaultdict, deque
4
+ from typing import List, Dict
5
+ import numpy as np
6
+ import faiss
7
+ from sentence_transformers import SentenceTransformer
8
+ from google import genai # must be configured in app.py and imported globally
9
+ import logging
10
+ from models.summarizer import get_summarizer
11
+
12
+ _LLM_SMALL = "gemini-2.5-flash-lite-preview-06-17"
13
+ # Load embedding model
14
+ EMBED = SentenceTransformer("/app/model_cache", device="cpu").half()
15
+ logger = logging.getLogger("rag-agent")
16
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
17
+
18
+ api_key = os.getenv("FlashAPI")
19
+ client = genai.Client(api_key=api_key)
20
+
21
+ class MemoryManager:
22
+ def __init__(self, max_users=1000, history_per_user=20, max_chunks=60):
23
+ # STM: recent conversation summaries (topic + summary), up to 5 entries
24
+ self.stm_summaries = defaultdict(lambda: deque(maxlen=history_per_user)) # deque of {topic,text,vec,timestamp,used}
25
+ # Legacy raw cache (kept for compatibility if needed)
26
+ self.text_cache = defaultdict(lambda: deque(maxlen=history_per_user))
27
+ # LTM: semantic chunk store (approx 3 chunks x 20 rounds)
28
+ self.chunk_index = defaultdict(self._new_index) # user_id -> faiss index
29
+ self.chunk_meta = defaultdict(list) # '' -> list[{text,tag,vec,timestamp,used}]
30
+ self.user_queue = deque(maxlen=max_users) # LRU of users
31
+ self.max_chunks = max_chunks # hard cap per user
32
+ self.chunk_cache = {} # hash(query+resp) -> [chunks]
33
+
34
+ # ---------- Public API ----------
35
+ def add_exchange(self, user_id: str, query: str, response: str, lang: str = "EN"):
36
+ self._touch_user(user_id)
37
+ # Keep raw record (optional)
38
+ self.text_cache[user_id].append(((query or "").strip(), (response or "").strip()))
39
+ if not response: return []
40
+ # Avoid re-chunking identical response
41
+ cache_key = hashlib.md5((query + response).encode()).hexdigest()
42
+ if cache_key in self.chunk_cache:
43
+ chunks = self.chunk_cache[cache_key]
44
+ else:
45
+ chunks = self.chunk_response(response, lang, question=query)
46
+ self.chunk_cache[cache_key] = chunks
47
+ # Update STM with merging/deduplication
48
+ for chunk in chunks:
49
+ self._upsert_stm(user_id, chunk, lang)
50
+ # Update LTM with merging/deduplication
51
+ self._upsert_ltm(user_id, chunks, lang)
52
+ return chunks
53
+
54
+ def get_relevant_chunks(self, user_id: str, query: str, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
55
+ """Return texts of chunks whose cosine similarity ≥ min_sim."""
56
+ if self.chunk_index[user_id].ntotal == 0:
57
+ return []
58
+ # Encode chunk
59
+ qvec = self._embed(query)
60
+ sims, idxs = self.chunk_index[user_id].search(np.array([qvec]), k=top_k)
61
+ results = []
62
+ # Append related result with smart-decay to optimize storage and prioritize most-recent chat
63
+ for sim, idx in zip(sims[0], idxs[0]):
64
+ if idx < len(self.chunk_meta[user_id]) and sim >= min_sim:
65
+ chunk = self.chunk_meta[user_id][idx]
66
+ chunk["used"] += 1 # increment usage
67
+ # Decay function
68
+ age_sec = time.time() - chunk["timestamp"]
69
+ decay = 1.0 / (1.0 + age_sec / 300) # 5-min half-life
70
+ score = sim * decay * (1 + 0.1 * chunk["used"])
71
+ # Append chunk with score
72
+ results.append((score, chunk))
73
+ # Sort result on best scored
74
+ results.sort(key=lambda x: x[0], reverse=True)
75
+ # logger.info(f"[Memory] RAG Retrieved Topic: {results}") # Inspect vector data
76
+ return [f"### Topic: {c['tag']}\n{c['text']}" for _, c in results]
77
+
78
+ def get_recent_chat_history(self, user_id: str, num_turns: int = 5) -> List[Dict]:
79
+ """
80
+ Get the most recent short-term memory summaries.
81
+ Returns: a list of entries containing only the summarized bot context.
82
+ """
83
+ if user_id not in self.stm_summaries:
84
+ return []
85
+ recent = list(self.stm_summaries[user_id])[-num_turns:]
86
+ formatted = []
87
+ for entry in recent:
88
+ formatted.append({
89
+ "user": "",
90
+ "bot": f"Topic: {entry['topic']}\n{entry['text']}",
91
+ "timestamp": entry.get("timestamp", time.time())
92
+ })
93
+ return formatted
94
+
95
+ def get_context(self, user_id: str, num_turns: int = 5) -> str:
96
+ # Prefer STM summaries
97
+ history = self.get_recent_chat_history(user_id, num_turns=num_turns)
98
+ return "\n".join(h["bot"] for h in history)
99
+
100
+ def get_contextual_chunks(self, user_id: str, current_query: str, lang: str = "EN") -> str:
101
+ """
102
+ Use NVIDIA Llama to create a summarization of relevant context from both recent history and RAG chunks.
103
+ This ensures conversational continuity while providing a concise summary for the main LLM.
104
+ """
105
+ # Get both types of context
106
+ recent_history = self.get_recent_chat_history(user_id, num_turns=5)
107
+ rag_chunks = self.get_relevant_chunks(user_id, current_query, top_k=3)
108
+
109
+ logger.info(f"[Contextual] Retrieved {len(recent_history)} recent history items")
110
+ logger.info(f"[Contextual] Retrieved {len(rag_chunks)} RAG chunks")
111
+
112
+ # Return empty string if no context is found
113
+ if not recent_history and not rag_chunks:
114
+ logger.info(f"[Contextual] No context found, returning empty string")
115
+ return ""
116
+
117
+ # Prepare context for summarization
118
+ context_parts = []
119
+ # Add recent chat history
120
+ if recent_history:
121
+ history_text = "\n".join([
122
+ f"User: {item['user']}\nBot: {item['bot']}"
123
+ for item in recent_history
124
+ ])
125
+ context_parts.append(f"Recent conversation history:\n{history_text}")
126
+ # Add RAG chunks
127
+ if rag_chunks:
128
+ rag_text = "\n".join(rag_chunks)
129
+ context_parts.append(f"Semantically relevant historical cooking information:\n{rag_text}")
130
+
131
+ # Combine all context
132
+ full_context = "\n\n".join(context_parts)
133
+
134
+ # Use summarizer to create concise summary
135
+ try:
136
+ summary = summarizer.summarize_text(full_context, max_length=300)
137
+ logger.info(f"[Contextual] Generated summary using NVIDIA Llama: {len(summary)} characters")
138
+ return summary
139
+ except Exception as e:
140
+ logger.error(f"[Contextual] Summarization failed: {e}")
141
+ return full_context[:500] + "..." if len(full_context) > 500 else full_context
142
+
143
+ def chunk_response(self, response: str, lang: str, question: str = "") -> List[Dict]:
144
+ """
145
+ Use NVIDIA Llama to chunk and summarize response by cooking topics.
146
+ Returns: [{"tag": ..., "text": ...}, ...]
147
+ """
148
+ if not response:
149
+ return []
150
+
151
+ try:
152
+ # Use summarizer to chunk and summarize
153
+ chunks = summarizer.chunk_response(response, max_chunk_size=500)
154
+
155
+ # Convert to the expected format
156
+ result_chunks = []
157
+ for i, chunk in enumerate(chunks):
158
+ # Extract topic from chunk (first sentence or key cooking terms)
159
+ topic = self._extract_topic_from_chunk(chunk)
160
+
161
+ result_chunks.append({
162
+ "tag": topic,
163
+ "text": chunk
164
+ })
165
+
166
+ logger.info(f"[Memory] 📦 NVIDIA Llama summarized {len(result_chunks)} chunks")
167
+ return result_chunks
168
+
169
+ except Exception as e:
170
+ logger.error(f"[Memory] NVIDIA Llama chunking failed: {e}")
171
+ # Fallback to simple chunking
172
+ return self._fallback_chunking(response)
173
+
174
+ def _extract_topic_from_chunk(self, chunk: str) -> str:
175
+ """Extract a concise topic from a chunk"""
176
+ # Look for cooking terms or first sentence
177
+ sentences = chunk.split('.')
178
+ if sentences:
179
+ first_sentence = sentences[0].strip()
180
+ if len(first_sentence) > 50:
181
+ first_sentence = first_sentence[:50] + "..."
182
+ return first_sentence
183
+ return "Cooking Information"
184
+
185
+ def _fallback_chunking(self, response: str) -> List[Dict]:
186
+ """Fallback chunking when NVIDIA Llama fails"""
187
+ # Simple sentence-based chunking
188
+ sentences = re.split(r'[.!?]+', response)
189
+ chunks = []
190
+ current_chunk = ""
191
+
192
+ for sentence in sentences:
193
+ sentence = sentence.strip()
194
+ if not sentence:
195
+ continue
196
+
197
+ if len(current_chunk) + len(sentence) > 300:
198
+ if current_chunk:
199
+ chunks.append({
200
+ "tag": "Cooking Information",
201
+ "text": current_chunk.strip()
202
+ })
203
+ current_chunk = sentence
204
+ else:
205
+ current_chunk += sentence + ". "
206
+
207
+ if current_chunk:
208
+ chunks.append({
209
+ "tag": "Cooking Information",
210
+ "text": current_chunk.strip()
211
+ })
212
+
213
+ return chunks
214
+
215
+ # ---------- Private Methods ----------
216
+ def _touch_user(self, user_id: str):
217
+ """Update LRU queue"""
218
+ if user_id in self.user_queue:
219
+ self.user_queue.remove(user_id)
220
+ self.user_queue.append(user_id)
221
+
222
+ def _new_index(self):
223
+ """Create new FAISS index"""
224
+ return faiss.IndexFlatIP(384) # 384-dim embeddings
225
+
226
+ def _upsert_stm(self, user_id: str, chunk: Dict, lang: str):
227
+ """Update short-term memory with merging/deduplication"""
228
+ topic = chunk["tag"]
229
+ text = chunk["text"]
230
+
231
+ # Check for similar topics in STM
232
+ for entry in self.stm_summaries[user_id]:
233
+ if self._topics_similar(topic, entry["topic"]):
234
+ # Merge with existing entry
235
+ entry["text"] = summarizer.summarize_text(
236
+ f"{entry['text']}\n{text}",
237
+ max_length=200
238
+ )
239
+ entry["timestamp"] = time.time()
240
+ return
241
+
242
+ # Add new entry
243
+ self.stm_summaries[user_id].append({
244
+ "topic": topic,
245
+ "text": text,
246
+ "vec": self._embed(f"{topic} {text}"),
247
+ "timestamp": time.time(),
248
+ "used": 0
249
+ })
250
+
251
+ def _upsert_ltm(self, user_id: str, chunks: List[Dict], lang: str):
252
+ """Update long-term memory with merging/deduplication"""
253
+ for chunk in chunks:
254
+ # Check for similar chunks in LTM
255
+ similar_idx = self._find_similar_chunk(user_id, chunk["text"])
256
+
257
+ if similar_idx is not None:
258
+ # Merge with existing chunk
259
+ existing = self.chunk_meta[user_id][similar_idx]
260
+ merged_text = summarizer.summarize_text(
261
+ f"{existing['text']}\n{chunk['text']}",
262
+ max_length=300
263
+ )
264
+ existing["text"] = merged_text
265
+ existing["timestamp"] = time.time()
266
+ else:
267
+ # Add new chunk
268
+ if len(self.chunk_meta[user_id]) >= self.max_chunks:
269
+ # Remove oldest chunk
270
+ self._remove_oldest_chunk(user_id)
271
+
272
+ vec = self._embed(chunk["text"])
273
+ self.chunk_index[user_id].add(np.array([vec]))
274
+ self.chunk_meta[user_id].append({
275
+ "text": chunk["text"],
276
+ "tag": chunk["tag"],
277
+ "vec": vec,
278
+ "timestamp": time.time(),
279
+ "used": 0
280
+ })
281
+
282
+ def _topics_similar(self, topic1: str, topic2: str) -> bool:
283
+ """Check if two topics are similar"""
284
+ # Simple similarity check based on common words
285
+ words1 = set(topic1.lower().split())
286
+ words2 = set(topic2.lower().split())
287
+ intersection = words1.intersection(words2)
288
+ return len(intersection) >= 2
289
+
290
+ def _find_similar_chunk(self, user_id: str, text: str) -> int:
291
+ """Find similar chunk in LTM"""
292
+ if not self.chunk_meta[user_id]:
293
+ return None
294
+
295
+ text_vec = self._embed(text)
296
+ sims, idxs = self.chunk_index[user_id].search(np.array([text_vec]), k=3)
297
+
298
+ for sim, idx in zip(sims[0], idxs[0]):
299
+ if sim > 0.8: # High similarity threshold
300
+ return int(idx)
301
+ return None
302
+
303
+ def _remove_oldest_chunk(self, user_id: str):
304
+ """Remove the oldest chunk from LTM"""
305
+ if not self.chunk_meta[user_id]:
306
+ return
307
+
308
+ # Find oldest chunk
309
+ oldest_idx = min(range(len(self.chunk_meta[user_id])),
310
+ key=lambda i: self.chunk_meta[user_id][i]["timestamp"])
311
+
312
+ # Remove from index and metadata
313
+ self.chunk_meta[user_id].pop(oldest_idx)
314
+ # Note: FAISS doesn't support direct removal, so we rebuild the index
315
+ self._rebuild_index(user_id)
316
+
317
+ def _rebuild_index(self, user_id: str):
318
+ """Rebuild FAISS index after removal"""
319
+ if not self.chunk_meta[user_id]:
320
+ self.chunk_index[user_id] = self._new_index()
321
+ return
322
+
323
+ vectors = [chunk["vec"] for chunk in self.chunk_meta[user_id]]
324
+ self.chunk_index[user_id] = self._new_index()
325
+ self.chunk_index[user_id].add(np.array(vectors))
326
+
327
+ @staticmethod
328
+ def _embed(text: str):
329
+ vec = EMBED.encode(text, convert_to_numpy=True)
330
+ # L2 normalise for cosine on IndexFlatIP
331
+ return vec / (np.linalg.norm(vec) + 1e-9)
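The retrieval scoring in `get_relevant_chunks` combines cosine similarity with a five-minute-half-life time decay and a usage bonus; the tiny sketch below reproduces just that formula so the weighting is easy to inspect in isolation (the numbers are arbitrary examples).

```python
# Standalone illustration of the chunk-scoring formula used in get_relevant_chunks:
# score = similarity * decay * (1 + 0.1 * times_used), with a 5-minute decay half-life.
import time

def chunk_score(similarity: float, timestamp: float, used: int, now: float | None = None) -> float:
    now = time.time() if now is None else now
    age_sec = now - timestamp
    decay = 1.0 / (1.0 + age_sec / 300)      # decay reaches 0.5 at 5 minutes of age
    return similarity * decay * (1 + 0.1 * used)

now = time.time()
fresh = chunk_score(0.80, now - 60, used=0, now=now)    # recent, never reused
stale = chunk_score(0.85, now - 3600, used=0, now=now)  # older but slightly more similar
print(f"fresh={fresh:.3f} stale={stale:.3f}")           # the fresh chunk usually wins
```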
models/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Models package
+ from .llama import NVIDIALLamaClient, process_search_query
+ from .summarizer import TextSummarizer, summarizer
models/download_model.py ADDED
@@ -0,0 +1,51 @@
+ # download_model.py
+ ### --- A. transformer and embedder ---
+ import os
+ import shutil
+ from huggingface_hub import snapshot_download
+
+ # Set up paths
+ MODEL_REPO = "sentence-transformers/all-MiniLM-L6-v2"
+ MODEL_CACHE_DIR = "/app/model_cache"
+
+ print("⏳ Downloading the SentenceTransformer model...")
+ model_path = snapshot_download(repo_id=MODEL_REPO, cache_dir=MODEL_CACHE_DIR)
+
+ print("Model path: ", model_path)
+
+ # Ensure the directory exists
+ if not os.path.exists(MODEL_CACHE_DIR):
+     os.makedirs(MODEL_CACHE_DIR)
+
+ # Move all contents from the snapshot folder
+ if os.path.exists(model_path):
+     print(f"📂 Moving model files from {model_path} to {MODEL_CACHE_DIR}...")
+
+     for item in os.listdir(model_path):
+         source = os.path.join(model_path, item)
+         destination = os.path.join(MODEL_CACHE_DIR, item)
+
+         if os.path.isdir(source):
+             shutil.copytree(source, destination, dirs_exist_ok=True)
+         else:
+             shutil.copy2(source, destination)
+
+     print(f"✅ Model extracted and flattened in {MODEL_CACHE_DIR}")
+ else:
+     print("❌ No snapshot directory found!")
+     exit(1)
+
+ # Verify structure after moving
+ print("\n📂 LLM Model Structure (Build Level):")
+ for root, dirs, files in os.walk(MODEL_CACHE_DIR):
+     print(f"📁 {root}/")
+     for file in files:
+         print(f"    📄 {file}")
+
+
+ ### --- B. translation modules ---
+ from transformers import pipeline
+ print("⏬ Downloading Vietnamese–English translator...")
+ _ = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en")
+ print("⏬ Downloading Chinese–English translator...")
+ _ = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
models/llama.py ADDED
@@ -0,0 +1,125 @@
1
+ import os
2
+ import requests
3
+ import json
4
+ import logging
5
+ import time
6
+ from typing import List, Dict, Tuple
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class NVIDIALLamaClient:
11
+ def __init__(self):
12
+ self.api_key = os.getenv("NVIDIA_URI")
13
+ if not self.api_key:
14
+ raise ValueError("NVIDIA_URI environment variable not set")
15
+
16
+ # Correct NVIDIA Integrate API base
17
+ self.base_url = "https://integrate.api.nvidia.com/v1"
18
+ self.model = "meta/llama-3.1-8b-instruct"
19
+
20
+ def generate_keywords(self, user_query: str) -> List[str]:
21
+ """Use Llama to generate search keywords from user query"""
22
+ try:
23
+ prompt = f"""Given this medical question: "{user_query}"
24
+
25
+ Generate 3-5 specific search keywords that would help find relevant medical information online.
26
+ Focus on medical terms, symptoms, conditions, treatments, or procedures mentioned.
27
+ Return only the keywords separated by commas, no explanations.
28
+
29
+ Keywords:"""
30
+
31
+ response = self._call_llama(prompt)
32
+
33
+ # Extract keywords from response
34
+ keywords = [kw.strip() for kw in response.split(',') if kw.strip()]
35
+ logger.info(f"Generated keywords: {keywords}")
36
+ return keywords[:5] # Limit to 5 keywords
37
+
38
+ except Exception as e:
39
+ logger.error(f"Failed to generate keywords: {e}")
40
+ return [user_query] # Fallback to original query
41
+
42
+ def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
43
+ """Use Llama to summarize documents and return summary with URL mapping"""
44
+ try:
45
+ # Import summarizer lazily (and relatively, since this lives in the models package) to avoid circular imports
45
+
46
+ from .summarizer import summarizer
47
+
48
+ # Use the summarizer for document summarization
49
+ combined_summary, url_mapping = summarizer.summarize_documents(documents, user_query)
50
+
51
+ return combined_summary, url_mapping
52
+
53
+ except Exception as e:
54
+ logger.error(f"Failed to summarize documents: {e}")
55
+ return "", {}
56
+
57
+ def _call_llama(self, prompt: str, max_retries: int = 3) -> str:
58
+ """Make API call to NVIDIA Llama model with retry logic"""
59
+ for attempt in range(max_retries):
60
+ try:
61
+ headers = {
62
+ "Authorization": f"Bearer {self.api_key}",
63
+ "Content-Type": "application/json"
64
+ }
65
+
66
+ payload = {
67
+ "model": self.model,
68
+ "messages": [
69
+ {
70
+ "role": "user",
71
+ "content": prompt
72
+ }
73
+ ],
74
+ "temperature": 0.7,
75
+ "max_tokens": 1000
76
+ }
77
+
78
+ response = requests.post(
79
+ f"{self.base_url}/chat/completions",
80
+ headers=headers,
81
+ json=payload,
82
+ timeout=30
83
+ )
84
+
85
+ response.raise_for_status()
86
+ result = response.json()
87
+
88
+ content = result['choices'][0]['message']['content'].strip()
89
+ if not content:
90
+ raise ValueError("Empty response from Llama API")
91
+
92
+ return content
93
+
94
+ except requests.exceptions.Timeout:
95
+ logger.warning(f"Llama API timeout (attempt {attempt + 1}/{max_retries})")
96
+ if attempt == max_retries - 1:
97
+ raise
98
+ time.sleep(2 ** attempt) # Exponential backoff
99
+
100
+ except requests.exceptions.RequestException as e:
101
+ logger.warning(f"Llama API request failed (attempt {attempt + 1}/{max_retries}): {e}")
102
+ if attempt == max_retries - 1:
103
+ raise
104
+ time.sleep(2 ** attempt)
105
+
106
+ except Exception as e:
107
+ logger.error(f"Llama API call failed: {e}")
108
+ raise
109
+
110
+ def process_search_query(user_query: str, search_results: List[Dict]) -> Tuple[str, Dict[int, str]]:
111
+ """Process search results using Llama model"""
112
+ try:
113
+ llama_client = NVIDIALLamaClient()
114
+
115
+ # Generate search keywords
116
+ keywords = llama_client.generate_keywords(user_query)
117
+
118
+ # Summarize documents
119
+ summary, url_mapping = llama_client.summarize_documents(search_results, user_query)
120
+
121
+ return summary, url_mapping
122
+
123
+ except Exception as e:
124
+ logger.error(f"Failed to process search query: {e}")
125
+ return "", {}
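For orientation, a hypothetical usage sketch of this module (not part of the diff; it assumes NVIDIA_URI is set to a valid API key, and the document dict is a placeholder shaped like the ones summarize_documents expects):

from models.llama import NVIDIALLamaClient, process_search_query

client = NVIDIALLamaClient()  # raises ValueError if NVIDIA_URI is not set
keywords = client.generate_keywords("What helps prevent chronic migraines?")
print(keywords)  # falls back to [original query] if the API call fails

docs = [{"id": 1, "url": "https://example.org/a", "title": "Example page", "content": "..."}]
summary, url_mapping = process_search_query("What helps prevent chronic migraines?", docs)
print(summary, url_mapping)  # url_mapping maps document ids to their source URLs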
models/summarizer.py ADDED
@@ -0,0 +1,216 @@
1
+ import re
2
+ import logging
3
+ from typing import List, Dict, Tuple
4
+ from .llama import NVIDIALLamaClient
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class TextSummarizer:
9
+ def __init__(self):
10
+ self.llama_client = NVIDIALLamaClient()
11
+
12
+ def clean_text(self, text: str) -> str:
13
+ """Clean and normalize text for summarization"""
14
+ if not text:
15
+ return ""
16
+
17
+ # Remove common conversation starters and fillers
18
+ conversation_patterns = [
19
+ r'\b(hi|hello|hey|sure|okay|yes|no|thanks|thank you)\b',
20
+ r'\b(here is|this is|let me|i will|i can|i would)\b',
21
+ r'\b(summarize|summary|here\'s|here is)\b',
22
+ r'\b(please|kindly|would you|could you)\b',
23
+ r'\b(um|uh|er|ah|well|so|like|you know)\b'
24
+ ]
25
+
26
+ # Remove excessive whitespace and normalize
27
+ text = re.sub(r'\s+', ' ', text)
28
+ text = re.sub(r'\n+', ' ', text)
29
+
30
+ # Remove conversation patterns
31
+ for pattern in conversation_patterns:
32
+ text = re.sub(pattern, '', text, flags=re.IGNORECASE)
33
+
34
+ # Remove extra punctuation and normalize
35
+ text = re.sub(r'[.]{2,}', '.', text)
36
+ text = re.sub(r'[!]{2,}', '!', text)
37
+ text = re.sub(r'[?]{2,}', '?', text)
38
+
39
+ return text.strip()
40
+
41
+ def extract_key_phrases(self, text: str) -> List[str]:
42
+ """Extract key medical phrases and terms"""
43
+ if not text:
44
+ return []
45
+
46
+ # Medical term patterns
47
+ medical_patterns = [
48
+ r'\b(?:symptoms?|diagnosis|treatment|therapy|medication|drug|disease|condition|syndrome)\b',
49
+ r'\b(?:patient|doctor|physician|medical|clinical|healthcare)\b',
50
+ r'\b(?:blood pressure|heart rate|temperature|pulse|respiration)\b',
51
+ r'\b(?:acute|chronic|severe|mild|moderate|serious|critical)\b',
52
+ r'\b(?:pain|ache|discomfort|swelling|inflammation|infection)\b'
53
+ ]
54
+
55
+ key_phrases = []
56
+ for pattern in medical_patterns:
57
+ matches = re.findall(pattern, text, re.IGNORECASE)
58
+ key_phrases.extend(matches)
59
+
60
+ return list(set(key_phrases)) # Remove duplicates
61
+
62
+ def summarize_text(self, text: str, max_length: int = 200) -> str:
63
+ """Summarize text using NVIDIA Llama model"""
64
+ try:
65
+ if not text or len(text.strip()) < 50:
66
+ return text
67
+
68
+ # Clean the text first
69
+ cleaned_text = self.clean_text(text)
70
+
71
+ # Extract key phrases for context
72
+ key_phrases = self.extract_key_phrases(cleaned_text)
73
+ key_phrases_str = ", ".join(key_phrases[:5]) if key_phrases else "medical information"
74
+
75
+ # Create optimized prompt
76
+ prompt = f"""Summarize this medical text in {max_length} characters or less. Focus only on key medical facts, symptoms, treatments, and diagnoses. Do not include greetings, confirmations, or conversational elements.
77
+
78
+ Key terms: {key_phrases_str}
79
+
80
+ Text: {cleaned_text[:1500]}
81
+
82
+ Summary:"""
83
+
84
+ summary = self.llama_client._call_llama(prompt)
85
+
86
+ # Post-process summary
87
+ summary = self.clean_text(summary)
88
+
89
+ # Ensure it's within length limit
90
+ if len(summary) > max_length:
91
+ summary = summary[:max_length-3] + "..."
92
+
93
+ return summary
94
+
95
+ except Exception as e:
96
+ logger.error(f"Summarization failed: {e}")
97
+ # Fallback to simple truncation
98
+ return self.clean_text(text)[:max_length]
99
+
100
+ def summarize_for_query(self, text: str, query: str, max_length: int = 220) -> str:
101
+ """Summarize text focusing strictly on information relevant to the query.
102
+ Returns an empty string if nothing relevant is found.
103
+ """
104
+ try:
105
+ if not text:
106
+ return ""
107
+ cleaned_text = self.clean_text(text)
108
+ if not cleaned_text:
109
+ return ""
110
+
111
+ # Short, strict prompt to avoid verbosity; instruct to output NOTHING if irrelevant
112
+ prompt = (
113
+ f"You extract only medically relevant facts that help answer: '{query}'. "
114
+ f"Respond with a concise bullet list (<= {max_length} chars total). "
115
+ "If the content is irrelevant, respond with EXACTLY: NONE.\n\n"
116
+ f"Content: {cleaned_text[:1600]}\n\nRelevant facts:"
117
+ )
118
+
119
+ summary = self.llama_client._call_llama(prompt)
120
+ summary = self.clean_text(summary)
121
+ if not summary or summary.upper().strip() == "NONE":
122
+ return ""
123
+ if len(summary) > max_length:
124
+ summary = summary[:max_length-3] + "..."
125
+ return summary
126
+ except Exception as e:
127
+ logger.warning(f"Query-focused summarization failed: {e}")
128
+ return ""
129
+
130
+ def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
131
+ """Summarize multiple documents with URL mapping"""
132
+ try:
133
+ doc_summaries = []
134
+ url_mapping = {}
135
+
136
+ for doc in documents:
137
+ doc_id = doc['id']
138
+ url_mapping[doc_id] = doc['url']
139
+
140
+ # Create focused summary for each document
141
+ summary_prompt = f"""Summarize this medical document in 2-3 sentences, focusing on information relevant to: "{user_query}"
142
+
143
+ Document: {doc['title']}
144
+ Content: {doc['content'][:800]}
145
+
146
+ Key medical information:"""
147
+
148
+ summary = self.llama_client._call_llama(summary_prompt)
149
+ summary = self.clean_text(summary)
150
+
151
+ doc_summaries.append(f"Document {doc_id}: {summary}")
152
+
153
+ combined_summary = "\n\n".join(doc_summaries)
154
+ return combined_summary, url_mapping
155
+
156
+ except Exception as e:
157
+ logger.error(f"Document summarization failed: {e}")
158
+ return "", {}
159
+
160
+ def summarize_conversation_chunk(self, chunk: str) -> str:
161
+ """Summarize a conversation chunk for memory"""
162
+ try:
163
+ if not chunk or len(chunk.strip()) < 30:
164
+ return chunk
165
+
166
+ cleaned_chunk = self.clean_text(chunk)
167
+
168
+ prompt = f"""Summarize this medical conversation in 1-2 sentences. Focus only on medical facts, symptoms, treatments, or diagnoses discussed. Remove greetings and conversational elements.
169
+
170
+ Conversation: {cleaned_chunk[:1000]}
171
+
172
+ Medical summary:"""
173
+
174
+ summary = self.llama_client._call_llama(prompt)
175
+ return self.clean_text(summary)
176
+
177
+ except Exception as e:
178
+ logger.error(f"Conversation summarization failed: {e}")
179
+ return self.clean_text(chunk)[:150]
180
+
181
+ def chunk_response(self, response: str, max_chunk_size: int = 500) -> List[str]:
182
+ """Split response into chunks and summarize each"""
183
+ try:
184
+ if not response or len(response) <= max_chunk_size:
185
+ return [response]
186
+
187
+ # Split by sentences first
188
+ sentences = re.split(r'[.!?]+', response)
189
+ chunks = []
190
+ current_chunk = ""
191
+
192
+ for sentence in sentences:
193
+ sentence = sentence.strip()
194
+ if not sentence:
195
+ continue
196
+
197
+ # Check if adding this sentence would exceed limit
198
+ if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
199
+ chunks.append(self.summarize_conversation_chunk(current_chunk))
200
+ current_chunk = sentence
201
+ else:
202
+ current_chunk += sentence + ". "
203
+
204
+ # Add the last chunk
205
+ if current_chunk:
206
+ chunks.append(self.summarize_conversation_chunk(current_chunk))
207
+
208
+ return chunks
209
+
210
+ except Exception as e:
211
+ logger.error(f"Response chunking failed: {e}")
212
+ return [response]
213
+
214
+ # Global summarizer instance
215
+ summarizer = TextSummarizer()
216
+
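A hypothetical usage sketch for the shared instance above (not part of the diff; every call goes through the NVIDIA Llama client, so NVIDIA_URI must be set and network access is required):

from models.summarizer import summarizer

long_text = "Patient reports severe headaches and nausea for two weeks. " * 20
short = summarizer.summarize_text(long_text, max_length=200)            # at most 200 characters
focused = summarizer.summarize_for_query(long_text, "headache causes")  # "" when nothing is relevant
chunks = summarizer.chunk_response(long_text, max_chunk_size=500)       # list of summarised chunks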
models/warmup.py ADDED
@@ -0,0 +1,8 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ import torch
3
+
4
+ print("🚀 Warming up model...")
5
+ embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
6
+ # embedding_model = embedding_model.half() # Reduce memory
7
+ embedding_model.to(torch.device("cpu"))
8
+ print("✅ Model warm-up complete!")
requirements.txt ADDED
@@ -0,0 +1,25 @@
1
+ # requirements.txt - Cooking Tutor API
2
+ # **LLMs**
3
+ google-genai
4
+ huggingface_hub
5
+ # **Memory & Embeddings**
6
+ faiss-cpu
7
+ sentence-transformers
8
+ # **Translation**
9
+ transformers
10
+ accelerate
11
+ sentencepiece
12
+ # **Environment**
13
+ python-dotenv
14
+ # **Deployment**
15
+ uvicorn
16
+ fastapi
17
+ torch # For translation models
18
+ psutil # System monitoring
19
+ # **Web Search**
20
+ requests
21
+ beautifulsoup4
22
+ langdetect
23
+ # **Data Processing**
24
+ pandas
25
+ numpy
search/.DS_Store ADDED
Binary file (6.15 kB). View file
 
search/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ # Search package
2
+ from .search import WebSearcher, search_web, search_web_with_content, search_medical, search_multilingual_medical, search_videos, search_comprehensive
3
+ from .coordinator import SearchCoordinator
4
+ from .engines import DuckDuckGoEngine, MedicalSearchEngine, MultilingualMedicalEngine, VideoSearchEngine
5
+ from .extractors import ContentExtractor
6
+ from .processors import MedicalSearchProcessor, LanguageProcessor, SourceAggregator, EnhancedContentProcessor
7
+
8
+ __all__ = [
9
+ 'WebSearcher',
10
+ 'search_web',
11
+ 'search_web_with_content',
12
+ 'search_medical',
13
+ 'search_multilingual_medical',
14
+ 'search_videos',
15
+ 'search_comprehensive',
16
+ 'SearchCoordinator',
17
+ 'DuckDuckGoEngine',
18
+ 'MedicalSearchEngine',
19
+ 'MultilingualMedicalEngine',
20
+ 'VideoSearchEngine',
21
+ 'ContentExtractor',
22
+ 'MedicalSearchProcessor',
23
+ 'LanguageProcessor',
24
+ 'SourceAggregator',
25
+ 'EnhancedContentProcessor'
26
+ ]
search/coordinator.py ADDED
@@ -0,0 +1,504 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+
6
+ from .engines.duckduckgo import DuckDuckGoEngine
7
+ from .engines.cooking import CookingSearchEngine
8
+ from .engines.multilingual import MultilingualCookingEngine
9
+ from .engines.video import VideoSearchEngine
10
+ from .extractors.content import ContentExtractor
11
+ from .processors.cooking import CookingSearchProcessor
12
+ from .processors.language import LanguageProcessor
13
+ from .processors.sources import SourceAggregator
14
+ from .processors.enhanced import EnhancedContentProcessor
15
+ # Reranker removed - using simple relevance scoring for cooking content
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class SearchCoordinator:
20
+ """Coordinate multiple search strategies for comprehensive cooking information"""
21
+
22
+ def __init__(self, max_workers: int = 3):
23
+ self.max_workers = max_workers
24
+
25
+ # Initialize search engines
26
+ self.duckduckgo_engine = DuckDuckGoEngine()
27
+ self.cooking_engine = CookingSearchEngine()
28
+ self.multilingual_engine = MultilingualCookingEngine()
29
+ self.video_engine = VideoSearchEngine()
30
+
31
+ # Initialize processors
32
+ self.content_extractor = ContentExtractor()
33
+ self.cooking_processor = CookingSearchProcessor()
34
+ self.language_processor = LanguageProcessor()
35
+ self.source_aggregator = SourceAggregator()
36
+ self.enhanced_processor = EnhancedContentProcessor()
37
+ self.reranker = None # No complex reranking needed for cooking content
38
+
39
+ # Search strategies
40
+ self.strategies = [
41
+ self._search_multilingual,
42
+ self._search_duckduckgo,
43
+ self._search_cooking_sources
44
+ ]
45
+
46
+ def search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
47
+ """Execute comprehensive multilingual search with multiple strategies"""
48
+ logger.info(f"Starting comprehensive multilingual search for: {query}")
49
+
50
+ # Detect and enhance query for multiple languages
51
+ enhanced_queries = self.language_processor.enhance_query(query, target_language)
52
+ logger.info(f"Enhanced queries: {list(enhanced_queries.keys())}")
53
+
54
+ # Execute search strategies in parallel
55
+ all_results = []
56
+
57
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
58
+ # Submit search tasks for each language
59
+ future_to_strategy = {}
60
+
61
+ for lang, enhanced_query in enhanced_queries.items():
62
+ for strategy in self.strategies:
63
+ future = executor.submit(strategy, enhanced_query, num_results // len(enhanced_queries), lang)
64
+ future_to_strategy[future] = f"{strategy.__name__}_{lang}"
65
+
66
+ # Collect results
67
+ for future in as_completed(future_to_strategy):
68
+ strategy_name = future_to_strategy[future]
69
+ try:
70
+ results = future.result()
71
+ if results:
72
+ all_results.extend(results)
73
+ logger.info(f"{strategy_name} found {len(results)} results")
74
+ except Exception as e:
75
+ logger.error(f"{strategy_name} failed: {e}")
76
+
77
+ # Remove duplicates and filter by language preference
78
+ unique_results = self._remove_duplicates(all_results)
79
+ if target_language:
80
+ unique_results = self.language_processor.filter_by_language(unique_results, target_language)
81
+
82
+ logger.info(f"Total unique results: {len(unique_results)}")
83
+
84
+ # Extract content from URLs
85
+ enriched_results = self._enrich_with_content(unique_results)
86
+
87
+ # Simple cooking relevance filtering
88
+ if enriched_results:
89
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
90
+ relevant_results = []
91
+ for result in enriched_results:
92
+ title = result.get('title', '').lower()
93
+ content = result.get('content', '').lower()
94
+ if any(keyword in title or keyword in content for keyword in cooking_keywords):
95
+ relevant_results.append(result)
96
+
97
+ if relevant_results:
98
+ enriched_results = relevant_results
99
+ logger.info(f"Filtered to {len(enriched_results)} cooking-relevant results")
100
+
101
+ # Process results into comprehensive summary
102
+ summary, url_mapping = self.cooking_processor.process_results(enriched_results, query)
103
+
104
+ logger.info(f"Multilingual search completed: {len(url_mapping)} sources processed")
105
+ return summary, url_mapping
106
+
107
+ def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
109
+ """Search using the multilingual cooking engine"""
109
+ try:
110
+ if language:
111
+ results = self.multilingual_engine.search_by_language(query, language, num_results)
112
+ else:
113
+ results = self.multilingual_engine.search(query, num_results)
114
+ return results
115
+ except Exception as e:
116
+ logger.error(f"Multilingual search failed: {e}")
117
+ return []
118
+
119
+ def _search_duckduckgo(self, query: str, num_results: int, language: str = None) -> List[Dict]:
120
+ """Search using DuckDuckGo engine"""
121
+ try:
122
+ results = self.duckduckgo_engine.search(query, num_results)
123
+ return results
124
+ except Exception as e:
125
+ logger.error(f"DuckDuckGo search failed: {e}")
126
+ return []
127
+
128
+ def _search_cooking_sources(self, query: str, num_results: int, language: str = None) -> List[Dict]:
129
+ """Search using cooking sources engine"""
130
+ try:
131
+ results = self.cooking_engine.search(query, num_results)
132
+ return results
133
+ except Exception as e:
134
+ logger.error(f"Cooking sources search failed: {e}")
135
+ return []
136
+
137
+ def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
138
+ """Remove duplicate results based on URL"""
139
+ seen_urls = set()
140
+ unique_results = []
141
+
142
+ for result in results:
143
+ url = result.get('url', '')
144
+ if url and url not in seen_urls:
145
+ seen_urls.add(url)
146
+ unique_results.append(result)
147
+
148
+ return unique_results
149
+
150
+ def _enrich_with_content(self, results: List[Dict]) -> List[Dict]:
151
+ """Enrich results with extracted content"""
152
+ enriched_results = []
153
+
154
+ # Extract content in parallel
155
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
156
+ # Submit content extraction tasks
157
+ future_to_result = {
158
+ executor.submit(self.content_extractor.extract, result['url']): result
159
+ for result in results
160
+ }
161
+
162
+ # Collect enriched results
163
+ for future in as_completed(future_to_result):
164
+ original_result = future_to_result[future]
165
+ try:
166
+ content = future.result()
167
+ if content:
168
+ enriched_result = original_result.copy()
169
+ enriched_result['content'] = content
170
+ enriched_results.append(enriched_result)
171
+ except Exception as e:
172
+ logger.warning(f"Content extraction failed for {original_result['url']}: {e}")
173
+ # Still include result without content
174
+ enriched_results.append(original_result)
175
+
176
+ return enriched_results
177
+
178
+ def quick_search(self, query: str, num_results: int = 5) -> List[Dict]:
179
+ """Quick search for basic results without content extraction"""
180
+ logger.info(f"Quick search for: {query}")
181
+
182
+ # Use only DuckDuckGo for speed
183
+ results = self.duckduckgo_engine.search(query, num_results)
184
+
185
+ # If no results, try with simplified query
186
+ if not results:
187
+ logger.warning("No results from DuckDuckGo, trying simplified query")
188
+ simplified_query = self._simplify_query(query)
189
+ if simplified_query != query:
190
+ results = self.duckduckgo_engine.search(simplified_query, num_results)
191
+ logger.info(f"Simplified query '{simplified_query}' found {len(results)} results")
192
+
193
+ # If still no results, try cooking engine as fallback
194
+ if not results:
195
+ logger.warning("Still no results, trying cooking engine fallback")
196
+ try:
197
+ cooking_results = self.cooking_engine.search(query, num_results)
198
+ if cooking_results:
199
+ results = cooking_results
200
+ logger.info(f"Cooking engine fallback found {len(results)} results")
201
+ except Exception as e:
202
+ logger.warning(f"Cooking engine fallback failed: {e}")
203
+
204
+ # Remove duplicates
205
+ unique_results = self._remove_duplicates(results)
206
+
207
+ # If we still have no results, create a basic fallback
208
+ if not unique_results:
209
+ logger.warning("No search results found, creating basic fallback")
210
+ unique_results = self._create_fallback_results(query)
211
+
212
+ logger.info(f"Quick search completed: {len(unique_results)} results")
213
+ return unique_results
214
+
215
+ def _simplify_query(self, query: str) -> str:
216
+ """Simplify query to core cooking terms"""
217
+ if not query:
218
+ return ""
219
+
220
+ # Extract key cooking terms
221
+ import re
222
+ words = query.split()
223
+
224
+ # Keep cooking keywords and important terms
225
+ cooking_keywords = [
226
+ 'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
227
+ 'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
228
+ 'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
229
+ 'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
230
+ 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
231
+ 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
232
+ 'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai'
233
+ ]
234
+
235
+ # Keep words that are cooking keywords or are important (longer than 3 chars)
236
+ important_words = []
237
+ for word in words:
238
+ word_lower = word.lower()
239
+ if word_lower in cooking_keywords or len(word) > 3:
240
+ important_words.append(word)
241
+
242
+ # If we have important words, use them; otherwise use first few words
243
+ if important_words:
244
+ return ' '.join(important_words[:5]) # Max 5 words
245
+ else:
246
+ return ' '.join(words[:3]) # Max 3 words
247
+
248
+ def _create_fallback_results(self, query: str) -> List[Dict]:
249
+ """Create basic fallback results when search fails"""
250
+ # Create some basic cooking information URLs as fallback
251
+ fallback_urls = [
252
+ "https://www.allrecipes.com",
253
+ "https://www.foodnetwork.com",
254
+ "https://www.epicurious.com",
255
+ "https://www.seriouseats.com",
256
+ "https://www.bonappetit.com"
257
+ ]
258
+
259
+ results = []
260
+ for i, url in enumerate(fallback_urls[:3]): # Limit to 3 fallback results
261
+ results.append({
262
+ 'url': url,
263
+ 'title': f"Cooking Information - {query}",
264
+ 'source': 'fallback',
265
+ 'composite_score': 0.3 - (i * 0.05) # Decreasing score
266
+ })
267
+
268
+ return results
269
+
270
+ def cooking_focus_search(self, query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
271
+ """Cooking-focused search with enhanced processing"""
272
+ logger.info(f"Cooking focus search for: {query}")
273
+
274
+ # Use cooking engine primarily
275
+ cooking_results = self.cooking_engine.search(query, num_results)
276
+
277
+ # Add some general results for context
278
+ general_results = self.duckduckgo_engine.search(query, 3)
279
+
280
+ # Combine and deduplicate
281
+ all_results = self._remove_duplicates(cooking_results + general_results)
282
+
283
+ # Enrich with content
284
+ enriched_results = self._enrich_with_content(all_results)
285
+
286
+ # Simple cooking relevance filtering
287
+ if enriched_results:
288
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
289
+ relevant_results = []
290
+ for result in enriched_results:
291
+ title = result.get('title', '').lower()
292
+ content = result.get('content', '').lower()
293
+ if any(keyword in title or keyword in content for keyword in cooking_keywords):
294
+ relevant_results.append(result)
295
+
296
+ if relevant_results:
297
+ enriched_results = relevant_results
298
+ logger.info(f"Filtered to {len(enriched_results)} cooking-relevant results")
299
+
300
+ # Process with cooking focus
301
+ summary, url_mapping = self.cooking_processor.process_results(enriched_results, query)
302
+
303
+ logger.info(f"Cooking focus search completed: {len(url_mapping)} sources")
304
+ return summary, url_mapping
305
+
306
+ def multilingual_cooking_search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
307
+ """Comprehensive multilingual cooking search"""
308
+ logger.info(f"Multilingual cooking search for: {query} (target: {target_language})")
309
+
310
+ # Detect source language
311
+ source_language = self.language_processor.detect_language(query)
312
+ logger.info(f"Detected source language: {source_language}")
313
+
314
+ # Use multilingual search with language preference
315
+ summary, url_mapping = self.search(query, num_results, target_language)
316
+
317
+ logger.info(f"Multilingual cooking search completed: {len(url_mapping)} sources")
318
+ return summary, url_mapping
319
+
320
+ def comprehensive_search(self, query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
321
+ """Comprehensive search with maximum information extraction and detailed references"""
322
+ logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
323
+
324
+ # Detect source language
325
+ source_language = self.language_processor.detect_language(query)
326
+ logger.info(f"Detected source language: {source_language}")
327
+
328
+ # Execute comprehensive search
329
+ search_results = []
330
+ video_results = []
331
+
332
+ # 1. Multilingual text search
333
+ text_summary, text_url_mapping = self.search(query, num_results, target_language)
334
+
335
+ # 2. Video search if requested
336
+ if include_videos:
337
+ try:
338
+ video_results = self.video_search(query, num_results=5, target_language=target_language)
339
+ logger.info(f"Video search found {len(video_results)} videos")
340
+ except Exception as e:
341
+ logger.warning(f"Video search failed: {e}")
342
+
343
+ # 3. Aggregate all sources
344
+ all_sources = []
345
+
346
+ # Add text sources
347
+ for i, url in text_url_mapping.items():
348
+ # Find corresponding source data
349
+ source_data = self._find_source_data(url, text_url_mapping)
350
+ if source_data:
351
+ all_sources.append(source_data)
352
+
353
+ # Add video sources
354
+ for video in video_results:
355
+ all_sources.append(video)
356
+
357
+ # 4. Process with enhanced content processor
358
+ if all_sources:
359
+ comprehensive_summary, detailed_mapping = self.enhanced_processor.process_comprehensive_content(all_sources, query)
360
+ else:
361
+ comprehensive_summary = text_summary
362
+ detailed_mapping = text_url_mapping
363
+
364
+ # 5. Create comprehensive source aggregation
365
+ source_aggregation = self.source_aggregator.aggregate_sources(all_sources, video_results)
366
+
367
+ # 6. Generate comprehensive references
368
+ comprehensive_references = self.source_aggregator.create_comprehensive_references(all_sources, max_references=20)
369
+
370
+ # 7. Add inline citations
371
+ final_summary = self.enhanced_processor.create_inline_citations(comprehensive_summary, detailed_mapping)
372
+
373
+ # 8. Add source statistics
374
+ source_stats = self.enhanced_processor.generate_source_statistics(all_sources)
375
+
376
+ # 9. Combine everything
377
+ final_response = f"{final_summary}\n\n{comprehensive_references}\n\n{source_stats}"
378
+
379
+ logger.info(f"Comprehensive search completed: {len(all_sources)} total sources processed")
380
+
381
+ return final_response, detailed_mapping, source_aggregation
382
+
383
+ def _find_source_data(self, url: str, url_mapping: Dict[int, str]) -> Dict:
384
+ """Find source data for a given URL"""
385
+ # This is a simplified version - ensure required fields always exist
386
+ return {
387
+ 'url': url,
388
+ 'title': f"Source: {url}",
389
+ 'content': '',
390
+ 'domain': self._extract_domain(url),
391
+ 'type': 'text',
392
+ 'source_type': 'text',
393
+ 'language': 'en',
394
+ 'source_name': '',
395
+ 'platform': ''
396
+ }
397
+
398
+ def _extract_domain(self, url: str) -> str:
399
+ """Extract domain from URL"""
400
+ try:
401
+ from urllib.parse import urlparse
402
+ parsed = urlparse(url)
403
+ domain = parsed.netloc.lower()
404
+ if domain.startswith('www.'):
405
+ domain = domain[4:]
406
+ return domain
407
+ except:
408
+ return ''
409
+
410
+ def video_search(self, query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
411
+ """Search for cooking videos across multiple platforms"""
412
+ logger.info(f"Video search for: {query} (target: {target_language})")
413
+
414
+ # Detect language if not provided
415
+ if not target_language:
416
+ target_language = self.language_processor.detect_language(query)
417
+
418
+ # Map language codes
419
+ lang_mapping = {
420
+ 'EN': 'en',
421
+ 'VI': 'vi',
422
+ 'ZH': 'zh',
423
+ 'en': 'en',
424
+ 'vi': 'vi',
425
+ 'zh': 'zh'
426
+ }
427
+ search_language = lang_mapping.get(target_language, 'en')
428
+
429
+ # Search for videos
430
+ raw_results = self.video_engine.search(query, num_results, search_language)
431
+
432
+ # Simple video relevance filtering
433
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
434
+ filtered_video_results = []
435
+ for result in raw_results:
436
+ title = result.get('title', '').lower()
437
+ if any(keyword in title for keyword in cooking_keywords):
438
+ filtered_video_results.append(result)
439
+
440
+ # Validate and normalize results to avoid corrupted cards/links
441
+ video_results = self._sanitize_video_results(filtered_video_results, limit=num_results)
442
+
443
+ logger.info(f"Video search completed: {len(video_results)} videos found")
444
+ return video_results
445
+
446
+ def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
447
+ """Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
448
+ Drop unreachable/broken items and deduplicate by URL.
449
+ """
450
+ from urllib.parse import urlparse
451
+ import requests
452
+ clean: List[Dict] = []
453
+ seen = set()
454
+ for item in results or []:
455
+ url = (item or {}).get('url', '')
456
+ title = (item or {}).get('title', '').strip()
457
+ if not url or not title:
458
+ continue
459
+ try:
460
+ parsed = urlparse(url)
461
+ if parsed.scheme not in ('http', 'https'):
462
+ continue
463
+ if not parsed.netloc:
464
+ continue
465
+ # Quick reachability check; YouTube often blocks HEAD, so skip strict checks for youtube domain
466
+ host = parsed.netloc.lower()
467
+ norm_url = url
468
+ if 'youtube.com' not in host:
469
+ try:
470
+ r = requests.head(url, allow_redirects=True, timeout=3)
471
+ if r.status_code >= 400:
472
+ continue
473
+ norm_url = getattr(r, 'url', url) or url
474
+ except Exception:
475
+ # If HEAD blocked, try a light GET with small timeout
476
+ try:
477
+ r = requests.get(url, stream=True, timeout=4)
478
+ if r.status_code >= 400:
479
+ continue
480
+ norm_url = getattr(r, 'url', url) or url
481
+ except Exception:
482
+ continue
483
+ if norm_url in seen:
484
+ continue
485
+ seen.add(norm_url)
486
+ platform = parsed.netloc.lower()
487
+ if platform.startswith('www.'):
488
+ platform = platform[4:]
489
+ clean.append({
490
+ 'title': title,
491
+ 'url': norm_url,
492
+ 'thumbnail': item.get('thumbnail', ''),
493
+ 'source': item.get('source', platform.split('.')[0]),
494
+ 'platform': platform,
495
+ 'language': item.get('language', 'en')
496
+ })
497
+ if len(clean) >= limit:
498
+ break
499
+ except Exception:
500
+ continue
501
+ return clean
502
+
503
+
504
+
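A hypothetical usage sketch for the coordinator (not part of the diff; the calls below perform live web requests and the queries are placeholders):

from search.coordinator import SearchCoordinator

coordinator = SearchCoordinator(max_workers=3)
summary, url_mapping = coordinator.search("how to roast a whole chicken", num_results=10)
quick_hits = coordinator.quick_search("sourdough starter feeding schedule", num_results=5)
videos = coordinator.video_search("how to fold dumplings", num_results=3, target_language="en")
full_text, mapping, aggregation = coordinator.comprehensive_search("vegan ramen broth", include_videos=True)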
search/engines/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .duckduckgo import DuckDuckGoEngine
2
+ from .medical import MedicalSearchEngine
3
+ from .multilingual import MultilingualMedicalEngine
4
+ from .video import VideoSearchEngine
5
+
6
+ __all__ = ['DuckDuckGoEngine', 'MedicalSearchEngine', 'MultilingualMedicalEngine', 'VideoSearchEngine']
search/engines/cooking.py ADDED
@@ -0,0 +1,197 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict
5
+ import time
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class CookingSearchEngine:
10
+ """Specialized cooking search engine with curated sources"""
11
+
12
+ def __init__(self, timeout: int = 15):
13
+ self.session = requests.Session()
14
+ self.session.headers.update({
15
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
16
+ })
17
+ self.timeout = timeout
18
+
19
+ # Curated cooking sources
20
+ self.cooking_sources = {
21
+ 'allrecipes': {
22
+ 'base_url': 'https://www.allrecipes.com',
23
+ 'search_url': 'https://www.allrecipes.com/search',
24
+ 'domains': ['allrecipes.com']
25
+ },
26
+ 'food_network': {
27
+ 'base_url': 'https://www.foodnetwork.com',
28
+ 'search_url': 'https://www.foodnetwork.com/search',
29
+ 'domains': ['foodnetwork.com']
30
+ },
31
+ 'epicurious': {
32
+ 'base_url': 'https://www.epicurious.com',
33
+ 'search_url': 'https://www.epicurious.com/search',
34
+ 'domains': ['epicurious.com']
35
+ },
36
+ 'serious_eats': {
37
+ 'base_url': 'https://www.seriouseats.com',
38
+ 'search_url': 'https://www.seriouseats.com/search',
39
+ 'domains': ['seriouseats.com']
40
+ },
41
+ 'bon_appetit': {
42
+ 'base_url': 'https://www.bonappetit.com',
43
+ 'search_url': 'https://www.bonappetit.com/search',
44
+ 'domains': ['bonappetit.com']
45
+ }
46
+ }
47
+
48
+ def search(self, query: str, num_results: int = 10) -> List[Dict]:
49
+ """Search cooking sources for relevant information"""
50
+ results = []
51
+
52
+ # Strategy 1: Direct cooking source searches
53
+ for source_name, source_config in self.cooking_sources.items():
54
+ if len(results) >= num_results:
55
+ break
56
+
57
+ source_results = self._search_cooking_source(query, source_name, source_config)
58
+ results.extend(source_results)
59
+
60
+ # Add delay between requests
61
+ time.sleep(0.5)
62
+
63
+ # Strategy 2: Cooking fallback sources
64
+ if len(results) < num_results:
65
+ fallback_results = self._get_fallback_sources(query, num_results - len(results))
66
+ results.extend(fallback_results)
67
+
68
+ return results[:num_results]
69
+
70
+ def _search_cooking_source(self, query: str, source_name: str, source_config: Dict) -> List[Dict]:
71
+ """Search a specific cooking source"""
72
+ try:
73
+ search_url = source_config.get('search_url')
74
+ if not search_url:
75
+ return []
76
+
77
+ params = {
78
+ 'q': query,
79
+ 'query': query,
80
+ 'search': query
81
+ }
82
+
83
+ response = self.session.get(search_url, params=params, timeout=self.timeout)
84
+ response.raise_for_status()
85
+
86
+ soup = BeautifulSoup(response.content, 'html.parser')
87
+ results = []
88
+
89
+ # Source-specific selectors
90
+ selectors = self._get_source_selectors(source_name)
91
+
92
+ for selector in selectors:
93
+ links = soup.select(selector)
94
+ if links:
95
+ logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
96
+ break
97
+
98
+ for link in links[:3]: # Limit per source
99
+ try:
100
+ href = link.get('href')
101
+ if not href:
102
+ continue
103
+
104
+ # Make absolute URL
105
+ if href.startswith('/'):
106
+ href = source_config['base_url'] + href
107
+
108
+ title = link.get_text(strip=True)
109
+ if title and href.startswith('http'):
110
+ results.append({
111
+ 'url': href,
112
+ 'title': title,
113
+ 'source': source_name,
114
+ 'domain': source_config['domains'][0]
115
+ })
116
+ except Exception as e:
117
+ logger.debug(f"Error parsing {source_name} link: {e}")
118
+ continue
119
+
120
+ return results
121
+
122
+ except Exception as e:
123
+ logger.warning(f"Cooking source {source_name} search failed: {e}")
124
+ return []
125
+
126
+ def _get_source_selectors(self, source_name: str) -> List[str]:
127
+ """Get CSS selectors for specific cooking sources"""
128
+ selectors_map = {
129
+ 'allrecipes': [
130
+ 'a[href*="/recipe/"]',
131
+ 'a[href*="/recipes/"]',
132
+ '.search-result a',
133
+ '.result-title a'
134
+ ],
135
+ 'food_network': [
136
+ 'a[href*="/recipes/"]',
137
+ '.search-result a',
138
+ '.result-title a',
139
+ 'a[href*="/recipe/"]'
140
+ ],
141
+ 'epicurious': [
142
+ 'a[href*="/recipes/"]',
143
+ '.search-result a',
144
+ '.result-title a',
145
+ 'a[href*="/recipe/"]'
146
+ ],
147
+ 'serious_eats': [
148
+ 'a[href*="/recipes/"]',
149
+ '.search-result a',
150
+ '.result-title a',
151
+ 'a[href*="/recipe/"]'
152
+ ],
153
+ 'bon_appetit': [
154
+ 'a[href*="/recipes/"]',
155
+ '.search-result a',
156
+ '.result-title a',
157
+ 'a[href*="/recipe/"]'
158
+ ]
159
+ }
160
+ return selectors_map.get(source_name, ['a[href*="http"]'])
161
+
162
+ def _get_fallback_sources(self, query: str, num_results: int) -> List[Dict]:
163
+ """Get fallback cooking sources when direct search fails"""
164
+ fallback_sources = [
165
+ {
166
+ 'url': 'https://www.allrecipes.com/recipes',
167
+ 'title': f'AllRecipes: {query}',
168
+ 'source': 'allrecipes_fallback',
169
+ 'domain': 'allrecipes.com'
170
+ },
171
+ {
172
+ 'url': 'https://www.foodnetwork.com/recipes',
173
+ 'title': f'Food Network: {query}',
174
+ 'source': 'foodnetwork_fallback',
175
+ 'domain': 'foodnetwork.com'
176
+ },
177
+ {
178
+ 'url': 'https://www.epicurious.com/recipes-menus',
179
+ 'title': f'Epicurious: {query}',
180
+ 'source': 'epicurious_fallback',
181
+ 'domain': 'epicurious.com'
182
+ },
183
+ {
184
+ 'url': 'https://www.seriouseats.com/recipes',
185
+ 'title': f'Serious Eats: {query}',
186
+ 'source': 'seriouseats_fallback',
187
+ 'domain': 'seriouseats.com'
188
+ },
189
+ {
190
+ 'url': 'https://www.bonappetit.com/recipes',
191
+ 'title': f'Bon Appétit: {query}',
192
+ 'source': 'bonappetit_fallback',
193
+ 'domain': 'bonappetit.com'
194
+ }
195
+ ]
196
+
197
+ return fallback_sources[:num_results]
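A hypothetical usage sketch for the engine above (not part of the diff; it scrapes the curated recipe sites listed in cooking_sources, so output depends on those sites being reachable):

from search.engines.cooking import CookingSearchEngine

engine = CookingSearchEngine(timeout=15)
for hit in engine.search("beef pho broth", num_results=5):
    print(hit["source"], hit["title"], hit["url"])  # curated landing pages are returned as a fallback if direct search fails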
search/engines/duckduckgo.py ADDED
@@ -0,0 +1,599 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict
5
+ import time
6
+ from models.reranker import MedicalReranker
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class DuckDuckGoEngine:
11
+ """DuckDuckGo search engine with multiple strategies"""
12
+
13
+ def __init__(self, timeout: int = 15):
14
+ self.session = requests.Session()
15
+ self.session.headers.update({
16
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
17
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
18
+ 'Accept-Language': 'en-US,en;q=0.5',
19
+ 'Accept-Encoding': 'gzip, deflate',
20
+ 'Connection': 'keep-alive',
21
+ 'Upgrade-Insecure-Requests': '1',
22
+ })
23
+ self.timeout = timeout
24
+ self.reranker = MedicalReranker()
25
+
26
+ def search(self, query: str, num_results: int = 10) -> List[Dict]:
27
+ """Search with multiple DuckDuckGo strategies and medical focus"""
28
+ # Clean and simplify the query first
29
+ clean_query = self._clean_query(query)
30
+ logger.info(f"Cleaned query: '{query}' -> '{clean_query}'")
31
+
32
+ results = []
33
+ min_score = 0.15 # Reduced from 0.3 to be less strict
34
+
35
+ # Strategy 1: HTML Interface with medical focus
36
+ html_results = self._search_html(clean_query, num_results * 3) # Get more to filter
37
+ if html_results:
38
+ results.extend(html_results)
39
+ logger.info(f"DuckDuckGo HTML found {len(html_results)} results")
40
+
41
+ # Strategy 2: Instant Answer API
42
+ if len(results) < num_results * 2:
43
+ api_results = self._search_api(clean_query, num_results)
44
+ if api_results:
45
+ results.extend(api_results)
46
+ logger.info(f"DuckDuckGo API found {len(api_results)} results")
47
+
48
+ # Strategy 3: Lite Interface (mobile-friendly)
49
+ if len(results) < num_results * 2:
50
+ lite_results = self._search_lite(clean_query, num_results)
51
+ if lite_results:
52
+ results.extend(lite_results)
53
+ logger.info(f"DuckDuckGo Lite found {len(lite_results)} results")
54
+
55
+ # If still no results, try with even simpler query
56
+ if not results:
57
+ simple_query = self._simplify_query(clean_query)
58
+ if simple_query != clean_query:
59
+ logger.info(f"Trying simplified query: '{simple_query}'")
60
+ html_results = self._search_html(simple_query, num_results * 2)
61
+ if html_results:
62
+ results.extend(html_results)
63
+ logger.info(f"Simplified query found {len(html_results)} results")
64
+
65
+ # If still no results, try fallback search engines
66
+ if not results:
67
+ logger.warning("DuckDuckGo failed, trying fallback search engines")
68
+ fallback_results = self._fallback_search(clean_query, num_results)
69
+ if fallback_results:
70
+ results.extend(fallback_results)
71
+ logger.info(f"Fallback search found {len(fallback_results)} results")
72
+
73
+ # Filter out irrelevant results first (less aggressive)
74
+ filtered_results = self._filter_irrelevant_sources(results)
75
+ logger.info(f"Filtered {len(results)} results to {len(filtered_results)} relevant results")
76
+
77
+ # If we have results, use reranker; otherwise return what we have
78
+ if filtered_results:
79
+ try:
80
+ reranked_results = self.reranker.rerank_results(clean_query, filtered_results, min_score)
81
+ logger.info(f"Reranked {len(filtered_results)} results to {len(reranked_results)} high-quality results")
82
+
83
+ # If reranking filtered out too many results, be more lenient
84
+ if len(reranked_results) < min(3, num_results) and len(filtered_results) > 0:
85
+ logger.warning(f"Reranking too strict ({len(reranked_results)} results), using fallback with lower threshold")
86
+ # Try with even lower threshold
87
+ fallback_results = self.reranker.rerank_results(clean_query, filtered_results, 0.05)
88
+ if len(fallback_results) > len(reranked_results):
89
+ return fallback_results[:num_results]
90
+ else:
91
+ # Last resort: return original filtered results with basic scoring
92
+ for i, result in enumerate(filtered_results[:num_results]):
93
+ result['composite_score'] = 0.5 - (i * 0.05) # Decreasing score
94
+ return filtered_results[:num_results]
95
+
96
+ return reranked_results[:num_results]
97
+ except Exception as e:
98
+ logger.warning(f"Reranking failed: {e}, returning filtered results")
99
+ return filtered_results[:num_results]
100
+
101
+ return filtered_results[:num_results]
102
+
103
+ def _clean_query(self, query: str) -> str:
104
+ """Clean and normalize search query"""
105
+ if not query:
106
+ return ""
107
+
108
+ # Remove bullet points and special characters
109
+ import re
110
+ cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query) # Remove bullet points
111
+ cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned) # Keep only alphanumeric, spaces, hyphens, dots
112
+ cleaned = re.sub(r'\s+', ' ', cleaned) # Normalize whitespace
113
+ cleaned = cleaned.strip()
114
+
115
+ # Remove common prefixes that might confuse search
116
+ prefixes_to_remove = [
117
+ r'^(en|vi|zh)\s*:\s*',
118
+ r'^(search|find|look for)\s+',
119
+ r'^(how to|what is|what are)\s+',
120
+ ]
121
+
122
+ for prefix in prefixes_to_remove:
123
+ cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)
124
+
125
+ return cleaned.strip()
126
+
127
+ def _simplify_query(self, query: str) -> str:
128
+ """Simplify query to core medical terms"""
129
+ if not query:
130
+ return ""
131
+
132
+ # Extract key medical terms
133
+ import re
134
+ words = query.split()
135
+
136
+ # Keep medical keywords and important terms
137
+ medical_keywords = [
138
+ 'migraine', 'headache', 'pain', 'treatment', 'therapy', 'medication', 'drug',
139
+ 'chronic', 'acute', 'symptoms', 'diagnosis', 'prevention', 'management',
140
+ 'disease', 'condition', 'syndrome', 'disorder', 'infection', 'inflammation',
141
+ 'blood', 'heart', 'lung', 'brain', 'liver', 'kidney', 'diabetes', 'cancer',
142
+ 'covid', 'flu', 'cold', 'fever', 'cough', 'breathing', 'chest', 'stomach'
143
+ ]
144
+
145
+ # Keep words that are medical keywords or are important (longer than 3 chars)
146
+ important_words = []
147
+ for word in words:
148
+ word_lower = word.lower()
149
+ if word_lower in medical_keywords or len(word) > 3:
150
+ important_words.append(word)
151
+
152
+ # If we have important words, use them; otherwise use first few words
153
+ if important_words:
154
+ return ' '.join(important_words[:5]) # Max 5 words
155
+ else:
156
+ return ' '.join(words[:3]) # Max 3 words
157
+
158
+ def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
159
+ """Filter out irrelevant sources like generic health pages, quizzes, etc."""
160
+ import re
161
+ filtered = []
162
+
163
+ # Only exclude obvious non-medical content
164
+ exclude_patterns = [
165
+ r'/quiz$', # Quiz pages (end of URL)
166
+ r'/test$', # Test pages (end of URL)
167
+ r'/assessment', # Assessment pages
168
+ r'/survey', # Survey pages
169
+ r'homepage|main page|index', # Homepage/index pages
170
+ r'login|sign.up|register', # Auth pages
171
+ r'contact|about.us|privacy', # Info pages
172
+ r'subscribe|newsletter|rss', # Subscription pages
173
+ r'sitemap', # Navigation pages
174
+ ]
175
+
176
+ for result in results:
177
+ url = result.get('url', '').lower()
178
+ title = result.get('title', '').lower()
179
+
180
+ # Skip if matches exclude patterns
181
+ should_exclude = False
182
+ for pattern in exclude_patterns:
183
+ if re.search(pattern, url) or re.search(pattern, title):
184
+ should_exclude = True
185
+ logger.debug(f"Excluding irrelevant source: {url}")
186
+ break
187
+
188
+ if not should_exclude:
189
+ filtered.append(result)
190
+
191
+ # If we filtered out too many, be less aggressive
192
+ if len(filtered) < len(results) * 0.3: # If we kept less than 30%
193
+ logger.warning(f"Filtering too aggressive, keeping more results: {len(results)} -> {len(filtered)}")
194
+ # Return original results with minimal filtering
195
+ minimal_filtered = []
196
+ for result in results:
197
+ url = result.get('url', '').lower()
198
+ if not any(re.search(pattern, url) for pattern in [r'login', r'sign.up', r'register']):
199
+ minimal_filtered.append(result)
200
+ return minimal_filtered
201
+
202
+ return filtered
203
+
204
+ def _search_html(self, query: str, num_results: int) -> List[Dict]:
205
+ """Search using DuckDuckGo HTML interface with better error handling"""
206
+ try:
207
+ # Try multiple DuckDuckGo endpoints
208
+ endpoints = [
209
+ {
210
+ 'url': 'https://html.duckduckgo.com/html/',
211
+ 'params': {
212
+ 'q': query,
213
+ 'kl': 'us-en',
214
+ 's': '0',
215
+ 'dc': '1',
216
+ 'v': 'l'
217
+ }
218
+ },
219
+ {
220
+ 'url': 'https://lite.duckduckgo.com/lite/',
221
+ 'params': {
222
+ 'q': query,
223
+ 'kl': 'us-en'
224
+ }
225
+ },
226
+ {
227
+ 'url': 'https://duckduckgo.com/html/',
228
+ 'params': {
229
+ 'q': query,
230
+ 'kl': 'us-en'
231
+ }
232
+ }
233
+ ]
234
+
235
+ for endpoint in endpoints:
236
+ try:
237
+ # Add random delay to avoid rate limiting
238
+ import time
239
+ time.sleep(0.5)
240
+
241
+ # Update headers to look more like a real browser
242
+ headers = self.session.headers.copy()
243
+ headers.update({
244
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
245
+ 'Accept-Language': 'en-US,en;q=0.5',
246
+ 'Accept-Encoding': 'gzip, deflate',
247
+ 'DNT': '1',
248
+ 'Connection': 'keep-alive',
249
+ 'Upgrade-Insecure-Requests': '1',
250
+ })
251
+
252
+ response = self.session.get(
253
+ endpoint['url'],
254
+ params=endpoint['params'],
255
+ headers=headers,
256
+ timeout=self.timeout
257
+ )
258
+
259
+ if response.status_code == 403:
260
+ logger.warning(f"DuckDuckGo endpoint {endpoint['url']} returned 403, trying next...")
261
+ continue
262
+ elif response.status_code == 429:
263
+ logger.warning(f"DuckDuckGo rate limited, waiting...")
264
+ time.sleep(2)
265
+ continue
266
+ else:
+ break  # Usable response received; stop trying further endpoints so the for-else fallback only fires when every endpoint fails
+
267
+ except Exception as e:
268
+ logger.warning(f"DuckDuckGo endpoint {endpoint['url']} failed: {e}")
269
+ if endpoint == endpoints[-1]: # Last endpoint
270
+ raise e
271
+ continue
272
+ else:
273
+ # All endpoints failed
274
+ logger.error("All DuckDuckGo endpoints failed")
275
+ return []
276
+
277
+ soup = BeautifulSoup(response.content, 'html.parser')
278
+ results = []
279
+
280
+ # Multiple selectors for different DDG layouts
281
+ selectors = [
282
+ 'a.result__a',
283
+ 'a[data-testid="result-title-a"]',
284
+ '.result__title a',
285
+ '.web-result a',
286
+ '.result a',
287
+ 'a[href*="http"]:not([href*="duckduckgo.com"])'
288
+ ]
289
+
290
+ for selector in selectors:
291
+ links = soup.select(selector)
292
+ if links:
293
+ logger.info(f"Using selector: {selector} - found {len(links)} links")
294
+ break
295
+
296
+ for link in links[:num_results]:
297
+ try:
298
+ href = link.get('href')
299
+ if not href or href.startswith('#') or 'duckduckgo.com' in href:
300
+ continue
301
+
302
+ # Clean up DDG redirect URLs
303
+ if href.startswith('/l/?uddg='):
304
+ import urllib.parse
305
+ href = urllib.parse.unquote(href.split('uddg=')[1])
306
+
307
+ title = link.get_text(strip=True)
308
+ if title and href.startswith('http'):
309
+ results.append({
310
+ 'url': href,
311
+ 'title': title,
312
+ 'source': 'duckduckgo_html'
313
+ })
314
+ except Exception as e:
315
+ logger.debug(f"Error parsing link: {e}")
316
+ continue
317
+
318
+ return results
319
+
320
+ except Exception as e:
321
+ logger.warning(f"DuckDuckGo HTML search failed: {e}")
322
+ return []
323
+
324
+ def _search_api(self, query: str, num_results: int) -> List[Dict]:
325
+ """Search using DuckDuckGo Instant Answer API"""
326
+ try:
327
+ url = "https://api.duckduckgo.com/"
328
+ params = {
329
+ 'q': query,
330
+ 'format': 'json',
331
+ 'no_html': '1',
332
+ 'skip_disambig': '1',
333
+ 't': 'MedicalChatbot'
334
+ }
335
+
336
+ response = self.session.get(url, params=params, timeout=self.timeout)
337
+ response.raise_for_status()
338
+ data = response.json()
339
+
340
+ results = []
341
+
342
+ # Abstract result
343
+ if data.get('AbstractURL') and data.get('Abstract'):
344
+ results.append({
345
+ 'url': data['AbstractURL'],
346
+ 'title': data.get('Heading', query),
347
+ 'content': data.get('Abstract', ''),
348
+ 'source': 'duckduckgo_api'
349
+ })
350
+
351
+ # Related topics
352
+ for topic in data.get('RelatedTopics', []):
353
+ if len(results) >= num_results:
354
+ break
355
+
356
+ if isinstance(topic, dict) and topic.get('FirstURL'):
357
+ text = topic.get('Text', '')
358
+ title = text.split(' - ')[0] if ' - ' in text else text[:50]
359
+
360
+ results.append({
361
+ 'url': topic['FirstURL'],
362
+ 'title': title,
363
+ 'content': text,
364
+ 'source': 'duckduckgo_api'
365
+ })
366
+
367
+ return results
368
+
369
+ except Exception as e:
370
+ logger.warning(f"DuckDuckGo API search failed: {e}")
371
+ return []
372
+
373
+ def _search_lite(self, query: str, num_results: int) -> List[Dict]:
374
+ """Search using DuckDuckGo Lite interface"""
375
+ try:
376
+ url = "https://lite.duckduckgo.com/lite/"
377
+ params = {
378
+ 'q': query,
379
+ 'kl': 'us-en'
380
+ }
381
+
382
+ response = self.session.get(url, params=params, timeout=self.timeout)
383
+ response.raise_for_status()
384
+
385
+ soup = BeautifulSoup(response.content, 'html.parser')
386
+ results = []
387
+
388
+ # Lite interface selectors
389
+ links = soup.select('a[href*="http"]:not([href*="duckduckgo.com"])')
390
+
391
+ for link in links[:num_results]:
392
+ try:
393
+ href = link.get('href')
394
+ title = link.get_text(strip=True)
395
+
396
+ if href and title and href.startswith('http'):
397
+ results.append({
398
+ 'url': href,
399
+ 'title': title,
400
+ 'source': 'duckduckgo_lite'
401
+ })
402
+ except Exception as e:
403
+ logger.debug(f"Error parsing lite link: {e}")
404
+ continue
405
+
406
+ return results
407
+
408
+ except Exception as e:
409
+ logger.warning(f"DuckDuckGo Lite search failed: {e}")
410
+ return []
411
+
412
+ def _fallback_search(self, query: str, num_results: int) -> List[Dict]:
413
+ """Fallback search using alternative methods when DuckDuckGo fails"""
414
+ results = []
415
+
416
+ # Try Bing search as fallback
417
+ try:
418
+ bing_results = self._search_bing(query, num_results)
419
+ if bing_results:
420
+ results.extend(bing_results)
421
+ logger.info(f"Bing fallback found {len(bing_results)} results")
422
+ except Exception as e:
423
+ logger.warning(f"Bing fallback failed: {e}")
424
+
425
+ # Try Startpage search as fallback
426
+ try:
427
+ startpage_results = self._search_startpage(query, num_results)
428
+ if startpage_results:
429
+ results.extend(startpage_results)
430
+ logger.info(f"Startpage fallback found {len(startpage_results)} results")
431
+ except Exception as e:
432
+ logger.warning(f"Startpage fallback failed: {e}")
433
+
434
+ # Try Searx instances as fallback
435
+ try:
436
+ searx_results = self._search_searx(query, num_results)
437
+ if searx_results:
438
+ results.extend(searx_results)
439
+ logger.info(f"Searx fallback found {len(searx_results)} results")
440
+ except Exception as e:
441
+ logger.warning(f"Searx fallback failed: {e}")
442
+
443
+ return results
444
+
445
+ def _search_bing(self, query: str, num_results: int) -> List[Dict]:
446
+ """Search using Bing as fallback"""
447
+ try:
448
+ url = "https://www.bing.com/search"
449
+ params = {
450
+ 'q': query,
451
+ 'count': min(num_results, 50),
452
+ 'first': 1
453
+ }
454
+
455
+ headers = self.session.headers.copy()
456
+ headers.update({
457
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
458
+ 'Accept-Language': 'en-US,en;q=0.5',
459
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
460
+ })
461
+
462
+ response = self.session.get(url, params=params, headers=headers, timeout=self.timeout)
463
+ response.raise_for_status()
464
+
465
+ soup = BeautifulSoup(response.content, 'html.parser')
466
+ results = []
467
+
468
+ # Bing result selectors
469
+ selectors = [
470
+ 'h2 a',
471
+ '.b_title a',
472
+ '.b_algo a'
473
+ ]
474
+
475
+ for selector in selectors:
476
+ links = soup.select(selector)
477
+ if links:
478
+ logger.info(f"Bing found {len(links)} links with selector: {selector}")
479
+ break
480
+
481
+ for link in links[:num_results]:
482
+ try:
483
+ href = link.get('href')
484
+ if not href or href.startswith('#') or 'bing.com' in href:
485
+ continue
486
+
487
+ title = link.get_text(strip=True)
488
+ if title and href.startswith('http'):
489
+ results.append({
490
+ 'url': href,
491
+ 'title': title,
492
+ 'source': 'bing_fallback'
493
+ })
494
+ except Exception as e:
495
+ logger.debug(f"Error parsing Bing link: {e}")
496
+ continue
497
+
498
+ return results
499
+
500
+ except Exception as e:
501
+ logger.warning(f"Bing search failed: {e}")
502
+ return []
503
+
504
+ def _search_startpage(self, query: str, num_results: int) -> List[Dict]:
505
+ """Search using Startpage as fallback"""
506
+ try:
507
+ url = "https://www.startpage.com/sp/search"
508
+ params = {
509
+ 'query': query,
510
+ 'cat': 'web',
511
+ 'pl': 'opensearch'
512
+ }
513
+
514
+ headers = self.session.headers.copy()
515
+ headers.update({
516
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
517
+ 'Accept-Language': 'en-US,en;q=0.5'
518
+ })
519
+
520
+ response = self.session.get(url, params=params, headers=headers, timeout=self.timeout)
521
+ response.raise_for_status()
522
+
523
+ soup = BeautifulSoup(response.content, 'html.parser')
524
+ results = []
525
+
526
+ # Startpage result selectors
527
+ links = soup.select('a[href*="http"]:not([href*="startpage.com"])')
528
+
529
+ for link in links[:num_results]:
530
+ try:
531
+ href = link.get('href')
532
+ if not href or href.startswith('#') or 'startpage.com' in href:
533
+ continue
534
+
535
+ title = link.get_text(strip=True)
536
+ if title and href.startswith('http'):
537
+ results.append({
538
+ 'url': href,
539
+ 'title': title,
540
+ 'source': 'startpage_fallback'
541
+ })
542
+ except Exception as e:
543
+ logger.debug(f"Error parsing Startpage link: {e}")
544
+ continue
545
+
546
+ return results
547
+
548
+ except Exception as e:
549
+ logger.warning(f"Startpage search failed: {e}")
550
+ return []
551
+
552
+ def _search_searx(self, query: str, num_results: int) -> List[Dict]:
553
+ """Search using public Searx instances as fallback"""
554
+ searx_instances = [
555
+ "https://searx.be",
556
+ "https://searx.tiekoetter.com",
557
+ "https://searx.xyz"
558
+ ]
559
+
560
+ for instance in searx_instances:
561
+ try:
562
+ url = f"{instance}/search"
563
+ params = {
564
+ 'q': query,
565
+ 'format': 'json'
566
+ }
567
+
568
+ response = self.session.get(url, params=params, timeout=self.timeout)
569
+ response.raise_for_status()
570
+
571
+ data = response.json()
572
+ results = []
573
+
574
+ for result in data.get('results', [])[:num_results]:
575
+ try:
576
+ url = result.get('url', '')
577
+ title = result.get('title', '')
578
+ content = result.get('content', '')
579
+
580
+ if url and title and url.startswith('http'):
581
+ results.append({
582
+ 'url': url,
583
+ 'title': title,
584
+ 'content': content,
585
+ 'source': 'searx_fallback'
586
+ })
587
+ except Exception as e:
588
+ logger.debug(f"Error parsing Searx result: {e}")
589
+ continue
590
+
591
+ if results:
592
+ logger.info(f"Searx instance {instance} found {len(results)} results")
593
+ return results
594
+
595
+ except Exception as e:
596
+ logger.debug(f"Searx instance {instance} failed: {e}")
597
+ continue
598
+
599
+ return []
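The endpoint loop in `_search_html` above leans on Python's `for`/`else` semantics: the `else` branch runs only when the loop finishes without hitting `break`, which is why a usable response must break out while blocked or rate-limited responses `continue` to the next endpoint. A minimal, self-contained sketch of that pattern (the endpoints and function name here are illustrative, not part of the module):

```python
import requests

def fetch_first_working(query: str):
    """Try endpoints in order; the for/else `else` fires only if no `break` ran."""
    endpoints = [
        "https://html.duckduckgo.com/html/",
        "https://lite.duckduckgo.com/lite/",
    ]
    for url in endpoints:
        try:
            resp = requests.get(url, params={"q": query}, timeout=10)
        except requests.RequestException:
            continue                     # network error: try the next endpoint
        if resp.status_code in (403, 429):
            continue                     # blocked or rate limited: try the next endpoint
        break                            # usable response: stop trying endpoints
    else:
        return None                      # every endpoint failed
    return resp
```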
search/engines/multilingual.py ADDED
@@ -0,0 +1,272 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict, Optional
5
+ import time
6
+ import re
7
+ from urllib.parse import urlparse, quote
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class MultilingualCookingEngine:
12
+ """Multilingual cooking search engine supporting English, Vietnamese, and Chinese sources"""
13
+
14
+ def __init__(self, timeout: int = 15):
15
+ self.session = requests.Session()
16
+ self.session.headers.update({
17
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
18
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
19
+ 'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
20
+ 'Accept-Encoding': 'gzip, deflate',
21
+ 'Connection': 'keep-alive',
22
+ })
23
+ self.timeout = timeout
24
+
25
+ # Comprehensive cooking sources by language
26
+ self.cooking_sources = {
27
+ 'en': {
28
+ # Major Cooking Sources
29
+ 'allrecipes': {
30
+ 'base_url': 'https://www.allrecipes.com',
31
+ 'search_url': 'https://www.allrecipes.com/search',
32
+ 'domains': ['allrecipes.com'],
33
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/recipes/"]', '.search-result a']
34
+ },
35
+ 'food_network': {
36
+ 'base_url': 'https://www.foodnetwork.com',
37
+ 'search_url': 'https://www.foodnetwork.com/search',
38
+ 'domains': ['foodnetwork.com'],
39
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
40
+ },
41
+ 'epicurious': {
42
+ 'base_url': 'https://www.epicurious.com',
43
+ 'search_url': 'https://www.epicurious.com/search',
44
+ 'domains': ['epicurious.com'],
45
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
46
+ },
47
+ 'serious_eats': {
48
+ 'base_url': 'https://www.seriouseats.com',
49
+ 'search_url': 'https://www.seriouseats.com/search',
50
+ 'domains': ['seriouseats.com'],
51
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
52
+ },
53
+ 'bon_appetit': {
54
+ 'base_url': 'https://www.bonappetit.com',
55
+ 'search_url': 'https://www.bonappetit.com/search',
56
+ 'domains': ['bonappetit.com'],
57
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
58
+ },
59
+ 'taste_of_home': {
60
+ 'base_url': 'https://www.tasteofhome.com',
61
+ 'search_url': 'https://www.tasteofhome.com/search',
62
+ 'domains': ['tasteofhome.com'],
63
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
64
+ },
65
+ 'food_com': {
66
+ 'base_url': 'https://www.food.com',
67
+ 'search_url': 'https://www.food.com/search',
68
+ 'domains': ['food.com'],
69
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
70
+ }
71
+ },
72
+ 'vi': {
73
+ # Vietnamese Cooking Sources
74
+ 'mon_ngon_viet': {
75
+ 'base_url': 'https://monngonviet.com',
76
+ 'search_url': 'https://monngonviet.com/tim-kiem',
77
+ 'domains': ['monngonviet.com'],
78
+ 'selectors': ['a[href*="/cong-thuc/"]', 'a[href*="/mon-an/"]', '.search-result a']
79
+ },
80
+ 'day_phong_cach': {
81
+ 'base_url': 'https://dayphongcach.vn',
82
+ 'search_url': 'https://dayphongcach.vn/tim-kiem',
83
+ 'domains': ['dayphongcach.vn'],
84
+ 'selectors': ['a[href*="/mon-an/"]', 'a[href*="/cong-thuc/"]', '.search-result a']
85
+ },
86
+ 'am_thuc_viet': {
87
+ 'base_url': 'https://amthucviet.vn',
88
+ 'search_url': 'https://amthucviet.vn/tim-kiem',
89
+ 'domains': ['amthucviet.vn'],
90
+ 'selectors': ['a[href*="/mon-an/"]', 'a[href*="/cong-thuc/"]', '.search-result a']
91
+ }
92
+ },
93
+ 'zh': {
94
+ # Chinese Cooking Sources
95
+ 'xiachufang': {
96
+ 'base_url': 'https://www.xiachufang.com',
97
+ 'search_url': 'https://www.xiachufang.com/search',
98
+ 'domains': ['xiachufang.com'],
99
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
100
+ },
101
+ 'douguo': {
102
+ 'base_url': 'https://www.douguo.com',
103
+ 'search_url': 'https://www.douguo.com/search',
104
+ 'domains': ['douguo.com'],
105
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
106
+ },
107
+ 'meishij': {
108
+ 'base_url': 'https://www.meishij.net',
109
+ 'search_url': 'https://www.meishij.net/search',
110
+ 'domains': ['meishij.net'],
111
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
112
+ }
113
+ }
114
+ }
115
+
116
+ def search(self, query: str, num_results: int = 10, languages: List[str] = None) -> List[Dict]:
117
+ """Search across multiple languages and cooking sources"""
118
+ if languages is None:
119
+ languages = ['en', 'vi', 'zh']
120
+
121
+ all_results = []
122
+
123
+ for lang in languages:
124
+ if lang in self.cooking_sources:
125
+ lang_results = self._search_language_sources(query, lang, num_results // len(languages))
126
+ all_results.extend(lang_results)
127
+ time.sleep(0.5) # Rate limiting between languages
128
+
129
+ return all_results[:num_results]
130
+
131
+ def _search_language_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
132
+ """Search sources for a specific language"""
133
+ results = []
134
+ sources = self.cooking_sources.get(language, {})
135
+
136
+ for source_name, source_config in sources.items():
137
+ if len(results) >= num_results:
138
+ break
139
+
140
+ source_results = self._search_source(query, source_name, source_config, language)
141
+ results.extend(source_results)
142
+ time.sleep(0.3) # Rate limiting
143
+
144
+ return results
145
+
146
+ def _search_source(self, query: str, source_name: str, source_config: Dict, language: str) -> List[Dict]:
147
+ """Search a specific cooking source"""
148
+ try:
149
+ search_url = source_config.get('search_url')
150
+ if not search_url:
151
+ return []
152
+
153
+ params = {
154
+ 'q': query,
155
+ 'query': query,
156
+ 'search': query,
157
+ 'keyword': query
158
+ }
159
+
160
+ response = self.session.get(search_url, params=params, timeout=self.timeout)
161
+ response.raise_for_status()
162
+
163
+ soup = BeautifulSoup(response.content, 'html.parser')
164
+ results = []
165
+
166
+ # Source-specific selectors
167
+ selectors = source_config.get('selectors', ['a[href*="http"]'])
168
+
169
+ for selector in selectors:
170
+ links = soup.select(selector)
171
+ if links:
172
+ logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
173
+ break
174
+
175
+ for link in links[:3]: # Limit per source
176
+ try:
177
+ href = link.get('href')
178
+ if not href:
179
+ continue
180
+
181
+ # Make absolute URL
182
+ if href.startswith('/'):
183
+ href = source_config['base_url'] + href
184
+
185
+ title = link.get_text(strip=True)
186
+ if title and href.startswith('http'):
187
+ results.append({
188
+ 'url': href,
189
+ 'title': title,
190
+ 'source': source_name,
191
+ 'domain': source_config['domains'][0],
192
+ 'language': language
193
+ })
194
+ except Exception as e:
195
+ logger.debug(f"Error parsing {source_name} link: {e}")
196
+ continue
197
+
198
+ return results
199
+
200
+ except Exception as e:
201
+ logger.warning(f"Cooking source {source_name} ({language}) search failed: {e}")
202
+ return []
203
+
204
+ def search_by_language(self, query: str, language: str, num_results: int = 10) -> List[Dict]:
205
+ """Search sources for a specific language only"""
206
+ if language not in self.cooking_sources:
207
+ logger.warning(f"Language {language} not supported")
208
+ return []
209
+
210
+ return self._search_language_sources(query, language, num_results)
211
+
212
+ def _get_fallback_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
213
+ """Get fallback cooking sources when direct search fails"""
214
+ fallback_sources = {
215
+ 'en': [
216
+ {
217
+ 'url': 'https://www.allrecipes.com/recipes',
218
+ 'title': f'AllRecipes: {query}',
219
+ 'source': 'allrecipes_fallback',
220
+ 'language': 'en',
221
+ 'domain': 'allrecipes.com'
222
+ },
223
+ {
224
+ 'url': 'https://www.foodnetwork.com/recipes',
225
+ 'title': f'Food Network: {query}',
226
+ 'source': 'foodnetwork_fallback',
227
+ 'language': 'en',
228
+ 'domain': 'foodnetwork.com'
229
+ },
230
+ {
231
+ 'url': 'https://www.epicurious.com/recipes-menus',
232
+ 'title': f'Epicurious: {query}',
233
+ 'source': 'epicurious_fallback',
234
+ 'language': 'en',
235
+ 'domain': 'epicurious.com'
236
+ }
237
+ ],
238
+ 'vi': [
239
+ {
240
+ 'url': 'https://monngonviet.com/cong-thuc',
241
+ 'title': f'Món Ngon Việt: {query}',
242
+ 'source': 'monngonviet_fallback',
243
+ 'language': 'vi',
244
+ 'domain': 'monngonviet.com'
245
+ },
246
+ {
247
+ 'url': 'https://dayphongcach.vn/mon-an',
248
+ 'title': f'Dạy Phong Cách: {query}',
249
+ 'source': 'dayphongcach_fallback',
250
+ 'language': 'vi',
251
+ 'domain': 'dayphongcach.vn'
252
+ }
253
+ ],
254
+ 'zh': [
255
+ {
256
+ 'url': 'https://www.xiachufang.com/recipe',
257
+ 'title': f'下厨房: {query}',
258
+ 'source': 'xiachufang_fallback',
259
+ 'language': 'zh',
260
+ 'domain': 'xiachufang.com'
261
+ },
262
+ {
263
+ 'url': 'https://www.douguo.com/recipe',
264
+ 'title': f'豆果: {query}',
265
+ 'source': 'douguo_fallback',
266
+ 'language': 'zh',
267
+ 'domain': 'douguo.com'
268
+ }
269
+ ]
270
+ }
271
+
272
+ return fallback_sources.get(language, [])[:num_results]
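A short usage sketch for the engine above. The import path follows the file location in this commit, the query is arbitrary, and live results depend on the listed recipe sites being reachable:

```python
from search.engines.multilingual import MultilingualCookingEngine

engine = MultilingualCookingEngine(timeout=15)

# num_results is split evenly across the requested languages inside search(),
# and each source contributes at most three links.
results = engine.search("beef pho broth", num_results=6, languages=["en", "vi"])

for r in results:
    print(f"[{r['language']}] {r['title']} -> {r['url']} ({r['domain']})")
```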
search/engines/video.py ADDED
@@ -0,0 +1,432 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict, Optional
5
+ import time
6
+ import re
7
+ from urllib.parse import urlparse, quote
8
+ from models.reranker import MedicalReranker
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ class VideoSearchEngine:
13
+ """Search engine for medical videos across multiple platforms"""
14
+
15
+ def __init__(self, timeout: int = 15):
16
+ self.session = requests.Session()
17
+ self.session.headers.update({
18
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
19
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
20
+ 'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
21
+ 'Accept-Encoding': 'gzip, deflate',
22
+ 'Connection': 'keep-alive',
23
+ })
24
+ self.timeout = timeout
25
+ self.reranker = MedicalReranker()
26
+
27
+ # Video platforms by language
28
+ self.video_platforms = {
29
+ 'en': [
30
+ {
31
+ 'name': 'youtube',
32
+ 'search_url': 'https://www.youtube.com/results',
33
+ 'params': {'search_query': ''},
34
+ 'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
35
+ 'base_url': 'https://www.youtube.com'
36
+ },
37
+ {
38
+ 'name': 'medscape_videos',
39
+ 'search_url': 'https://www.medscape.com/search',
40
+ 'params': {'q': ''},
41
+ 'selectors': ['a[href*="/video/"]', 'a[href*="/viewarticle/"]'],
42
+ 'base_url': 'https://www.medscape.com'
43
+ }
44
+ ],
45
+ 'vi': [
46
+ {
47
+ 'name': 'youtube_vi',
48
+ 'search_url': 'https://www.youtube.com/results',
49
+ 'params': {'search_query': ''},
50
+ 'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
51
+ 'base_url': 'https://www.youtube.com'
52
+ },
53
+ {
54
+ 'name': 'vinmec_videos',
55
+ 'search_url': 'https://www.vinmec.com/vi/tim-kiem',
56
+ 'params': {'q': ''},
57
+ 'selectors': ['a[href*="/video/"]', 'a[href*="/suc-khoe/"]'],
58
+ 'base_url': 'https://www.vinmec.com'
59
+ }
60
+ ],
61
+ 'zh': [
62
+ {
63
+ 'name': 'youtube_zh',
64
+ 'search_url': 'https://www.youtube.com/results',
65
+ 'params': {'search_query': ''},
66
+ 'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
67
+ 'base_url': 'https://www.youtube.com'
68
+ },
69
+ {
70
+ 'name': 'haodf_videos',
71
+ 'search_url': 'https://www.haodf.com/search',
72
+ 'params': {'q': ''},
73
+ 'selectors': ['a[href*="/video/"]', 'a[href*="/jibing/"]'],
74
+ 'base_url': 'https://www.haodf.com'
75
+ }
76
+ ]
77
+ }
78
+
79
+ def _normalize_query(self, q: str) -> str:
80
+ if not q:
81
+ return ""
82
+ q = q.strip()
83
+ q = re.sub(r"^(en|vi|zh)\s*:\s*", "", q, flags=re.IGNORECASE)
84
+ # Remove bullet points and special characters
85
+ q = re.sub(r'[•·▪▫‣⁃]', ' ', q)
86
+ q = re.sub(r'[^\w\s\-\.]', ' ', q)
87
+ q = re.sub(r"\s+", " ", q)
88
+ return q.strip()
89
+
90
+ def _is_valid_medical_video(self, result: Dict, query: str) -> bool:
91
+ """Check if video is medically relevant and has valid URL"""
92
+ url = result.get('url', '')
93
+ title = result.get('title', '')
94
+
95
+ # Skip generic YouTube search result pages
96
+ if 'results?search_query=' in url:
97
+ return False
98
+
99
+ # Skip non-YouTube URLs that aren't medical platforms
100
+ if 'youtube.com' not in url and not any(med in url for med in ['medscape.com', 'vinmec.com', 'haodf.com']):
101
+ return False
102
+
103
+ # Check if title contains medical keywords or query terms
104
+ title_lower = title.lower()
105
+ query_lower = query.lower()
106
+
107
+ medical_keywords = [
108
+ 'medical', 'health', 'doctor', 'treatment', 'diagnosis',
109
+ 'symptoms', 'therapy', 'medicine', 'clinical', 'patient',
110
+ 'disease', 'condition', 'healthcare', 'physician'
111
+ ]
112
+
113
+ # Must contain medical keywords or query terms
114
+ has_medical = any(keyword in title_lower for keyword in medical_keywords)
115
+ has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
116
+
117
+ return has_medical or has_query
118
+
119
+ def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
120
+ """Search platform with retry logic and better error handling"""
121
+ for attempt in range(max_retries):
122
+ try:
123
+ return self._search_platform(query, platform, num_results)
124
+ except Exception as e:
125
+ logger.warning(f"Attempt {attempt + 1} failed for {platform['name']}: {e}")
126
+ if attempt < max_retries - 1:
127
+ time.sleep(1) # Wait before retry
128
+ else:
129
+ logger.error(f"All attempts failed for {platform['name']}")
130
+ return []
131
+
132
+ def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
133
+ """Search for medical videos across platforms with deduplication and medical filtering"""
134
+ query = self._normalize_query(query)
135
+ logger.info(f"Searching for medical videos: {query} (language: {language})")
136
+
137
+ results = []
138
+ seen_urls = set() # Track URLs to avoid duplicates
139
+ seen_video_ids = set() # Track video IDs to avoid duplicates
140
+ platforms = self.video_platforms.get(language, self.video_platforms['en'])
141
+
142
+ # Try platforms in order of reliability
143
+ for platform in platforms:
144
+ if len(results) >= num_results:
145
+ break
146
+
147
+ try:
148
+ # Add timeout and retry logic
149
+ platform_results = self._search_platform_with_retry(query, platform, num_results * 3)
150
+
151
+ if not platform_results:
152
+ logger.warning(f"No results from {platform['name']}")
153
+ continue
154
+
155
+ # Filter out duplicates and non-medical content
156
+ for result in platform_results:
157
+ url = result.get('url', '')
158
+ video_id = self._extract_video_id(url)
159
+
160
+ # Skip if URL or video ID already seen
161
+ if url in seen_urls or (video_id and video_id in seen_video_ids):
162
+ continue
163
+
164
+ # Check if it's a valid medical video (less strict for more results)
165
+ if self._is_valid_medical_video(result, query):
166
+ seen_urls.add(url)
167
+ if video_id:
168
+ seen_video_ids.add(video_id)
169
+
170
+ # Normalize YouTube URLs
171
+ if video_id and 'youtube.com' in url:
172
+ result['url'] = f"https://www.youtube.com/watch?v={video_id}"
173
+ result['video_id'] = video_id
174
+
175
+ results.append(result)
176
+ if len(results) >= num_results:
177
+ break
178
+
179
+ time.sleep(0.5) # Rate limiting
180
+ except Exception as e:
181
+ logger.warning(f"Video search failed for {platform['name']}: {e}")
182
+ continue
183
+
184
+ # Add fallback video sources if needed
185
+ if len(results) < num_results:
186
+ # Try resilient YouTube via Invidious API
187
+ try:
188
+ resilient = self._search_youtube_invidious(query, language, num_results - len(results))
189
+ for result in resilient:
190
+ url = result.get('url', '')
191
+ video_id = result.get('video_id', '')
192
+
193
+ if (url not in seen_urls and
194
+ video_id not in seen_video_ids and
195
+ self._is_valid_medical_video(result, query)):
196
+ seen_urls.add(url)
197
+ if video_id:
198
+ seen_video_ids.add(video_id)
199
+ results.append(result)
200
+ if len(results) >= num_results:
201
+ break
202
+ except Exception as e:
203
+ logger.warning(f"Invidious fallback failed: {e}")
204
+
205
+ # If still no results, try generic video search fallback
206
+ if len(results) < num_results:
207
+ try:
208
+ fallback_results = self._get_fallback_videos(query, language, num_results - len(results))
209
+ for result in fallback_results:
210
+ if result['url'] not in seen_urls:
211
+ seen_urls.add(result['url'])
212
+ results.append(result)
213
+ if len(results) >= num_results:
214
+ break
215
+ logger.info(f"Added {len(fallback_results)} fallback video results")
216
+ except Exception as e:
217
+ logger.warning(f"Fallback video search failed: {e}")
218
+
219
+ # Use reranker to improve quality and relevance
220
+ if results:
221
+ reranked_results = self.reranker.filter_youtube_results(results, query)
222
+ logger.info(f"Reranked {len(results)} video results to {len(reranked_results)} high-quality results")
223
+ return reranked_results[:num_results]
224
+
225
+ logger.info(f"Found {len(results)} medical video results")
226
+ return results[:num_results]
227
+
228
+ def _search_platform(self, query: str, platform: Dict, num_results: int) -> List[Dict]:
229
+ """Search a specific video platform with improved error handling"""
230
+ try:
231
+ search_url = platform['search_url']
232
+ params = platform['params'].copy()
233
+
234
+ # Set search query parameter
235
+ for param_name in params.keys():
236
+ params[param_name] = query
237
+
238
+ # Add headers to avoid blocking
239
+ headers = self.session.headers.copy()
240
+ headers.update({
241
+ 'Referer': 'https://www.google.com/',
242
+ 'Cache-Control': 'no-cache',
243
+ })
244
+
245
+ # Try with shorter timeout first
246
+ response = self.session.get(search_url, params=params, headers=headers, timeout=10)
247
+
248
+ # Check for common error responses
249
+ if response.status_code == 404:
250
+ logger.warning(f"Platform {platform['name']} returned 404 - endpoint may have changed")
251
+ return []
252
+ elif response.status_code == 403:
253
+ logger.warning(f"Platform {platform['name']} returned 403 - may be blocking requests")
254
+ return []
255
+ elif response.status_code >= 400:
256
+ logger.warning(f"Platform {platform['name']} returned {response.status_code}")
257
+ return []
258
+
259
+ response.raise_for_status()
260
+
261
+ soup = BeautifulSoup(response.content, 'html.parser')
262
+ results = []
263
+
264
+ # Try platform-specific selectors
265
+ selectors = platform.get('selectors', ['a[href*="video"]', 'a[href*="watch"]'])
266
+
267
+ links = []
268
+ for selector in selectors:
269
+ links = soup.select(selector)
270
+ if links:
271
+ logger.info(f"{platform['name']} found {len(links)} video links with selector: {selector}")
272
+ break
273
+
274
+ # If no links found, try generic selectors
275
+ if not links:
276
+ generic_selectors = ['a[href*="http"]', 'a[href*="www"]']
277
+ for selector in generic_selectors:
278
+ links = soup.select(selector)
279
+ if links:
280
+ logger.info(f"{platform['name']} found {len(links)} generic links with selector: {selector}")
281
+ break
282
+
283
+ for link in links[:num_results]:
284
+ try:
285
+ href = link.get('href')
286
+ if not href:
287
+ continue
288
+
289
+ # Make absolute URL
290
+ if href.startswith('/'):
291
+ href = platform['base_url'] + href
292
+
293
+ # Skip if not a valid URL
294
+ if not href.startswith('http'):
295
+ continue
296
+
297
+ title = link.get_text(strip=True) or platform['name']
298
+ if title and href:
299
+ results.append({
300
+ 'url': href,
301
+ 'title': title,
302
+ 'platform': platform['name'],
303
+ 'type': 'video',
304
+ 'source': platform['name']
305
+ })
306
+ except Exception as e:
307
+ logger.debug(f"Error parsing {platform['name']} link: {e}")
308
+ continue
309
+
310
+ return results
311
+
312
+ except requests.exceptions.Timeout:
313
+ logger.warning(f"Platform {platform['name']} search timed out")
314
+ return []
315
+ except requests.exceptions.ConnectionError:
316
+ logger.warning(f"Platform {platform['name']} connection failed - network issue")
317
+ return []
318
+ except Exception as e:
319
+ logger.warning(f"Platform {platform['name']} search failed: {e}")
320
+ return []
321
+
322
+ def _search_youtube_invidious(self, query: str, language: str, needed: int) -> List[Dict]:
323
+ """Search YouTube via public Invidious instances (no API key)."""
324
+ if needed <= 0:
325
+ return []
326
+ instances = [
327
+ "https://yewtu.be",
328
+ "https://invidious.flokinet.to",
329
+ "https://vid.puffyan.us",
330
+ "https://iv.ggtyler.dev"
331
+ ]
332
+ out: List[Dict] = []
333
+ q = quote(query)
334
+ for base in instances:
335
+ if len(out) >= needed:
336
+ break
337
+ try:
338
+ url = f"{base}/api/v1/search?q={q}&region={'VN' if language=='vi' else 'US'}&fields=title,videoId,author&type=video"
339
+ r = self.session.get(url, timeout=6)
340
+ r.raise_for_status()
341
+ data = r.json()
342
+ for item in data:
343
+ if len(out) >= needed:
344
+ break
345
+ vid = item.get("videoId")
346
+ title = (item.get("title") or "").strip()
347
+ if not vid or not title:
348
+ continue
349
+ out.append({
350
+ 'url': f"https://www.youtube.com/watch?v={vid}",
351
+ 'title': title,
352
+ 'thumbnail': f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg",
353
+ 'platform': 'youtube',
354
+ 'source': 'youtube',
355
+ 'type': 'video',
356
+ 'language': language
357
+ })
358
+ except Exception as e:
359
+ logger.debug(f"Invidious {base} failed: {e}")
360
+ continue
361
+ return out
362
+
363
+ def _get_fallback_videos(self, query: str, language: str, num_results: int) -> List[Dict]:
364
+ """Get fallback video sources when direct search fails"""
365
+ fallback_videos = {
366
+ 'en': [
367
+ {
368
+ 'url': 'https://www.youtube.com/results?search_query=medical+' + quote(query),
369
+ 'title': f'Medical Videos: {query}',
370
+ 'platform': 'youtube_fallback',
371
+ 'type': 'video',
372
+ 'source': 'youtube'
373
+ },
374
+ {
375
+ 'url': 'https://www.medscape.com/search?q=' + quote(query),
376
+ 'title': f'Medscape Videos: {query}',
377
+ 'platform': 'medscape_fallback',
378
+ 'type': 'video',
379
+ 'source': 'medscape'
380
+ }
381
+ ],
382
+ 'vi': [
383
+ {
384
+ 'url': 'https://www.youtube.com/results?search_query=y+tế+' + quote(query),
385
+ 'title': f'Video Y Tế: {query}',
386
+ 'platform': 'youtube_vi_fallback',
387
+ 'type': 'video',
388
+ 'source': 'youtube'
389
+ },
390
+ {
391
+ 'url': 'https://www.vinmec.com/vi/suc-khoe',
392
+ 'title': f'Vinmec Videos: {query}',
393
+ 'platform': 'vinmec_fallback',
394
+ 'type': 'video',
395
+ 'source': 'vinmec'
396
+ }
397
+ ],
398
+ 'zh': [
399
+ {
400
+ 'url': 'https://www.youtube.com/results?search_query=医疗+' + quote(query),
401
+ 'title': f'医疗视频: {query}',
402
+ 'platform': 'youtube_zh_fallback',
403
+ 'type': 'video',
404
+ 'source': 'youtube'
405
+ },
406
+ {
407
+ 'url': 'https://www.haodf.com/jibing',
408
+ 'title': f'好大夫视频: {query}',
409
+ 'platform': 'haodf_fallback',
410
+ 'type': 'video',
411
+ 'source': 'haodf'
412
+ }
413
+ ]
414
+ }
415
+
416
+ return fallback_videos.get(language, fallback_videos['en'])[:num_results]
417
+
418
+ def _extract_video_id(self, url: str) -> Optional[str]:
419
+ """Extract YouTube video ID from URL"""
420
+ patterns = [
421
+ r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
422
+ r'(?:embed\/)([0-9A-Za-z_-]{11})',
423
+ r'(?:watch\?v=)([0-9A-Za-z_-]{11})'
424
+ ]
425
+
426
+ for pattern in patterns:
427
+ match = re.search(pattern, url)
428
+ if match:
429
+ return match.group(1)
430
+
431
+ return None
432
+
search/extractors/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .content import ContentExtractor
2
+
3
+ __all__ = ['ContentExtractor']
search/extractors/content.py ADDED
@@ -0,0 +1,211 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import Dict, Optional
5
+ import re
6
+ from urllib.parse import urlparse
7
+ import time
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class ContentExtractor:
12
+ """Extract and clean content from web pages"""
13
+
14
+ def __init__(self, timeout: int = 15):
15
+ self.session = requests.Session()
16
+ self.session.headers.update({
17
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
18
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19
+ 'Accept-Language': 'en-US,en;q=0.5',
20
+ 'Accept-Encoding': 'gzip, deflate',
21
+ 'Connection': 'keep-alive',
22
+ })
23
+ self.timeout = timeout
24
+
25
+ # Medical content indicators
26
+ self.medical_indicators = [
27
+ 'symptom', 'treatment', 'diagnosis', 'medicine', 'medication',
28
+ 'therapy', 'condition', 'disease', 'health', 'medical',
29
+ 'doctor', 'physician', 'patient', 'clinical', 'study'
30
+ ]
31
+
32
+ def extract(self, url: str, max_length: int = 2000) -> Optional[str]:
33
+ """Extract content from a URL with medical focus"""
34
+ try:
35
+ response = self.session.get(url, timeout=self.timeout)
36
+ response.raise_for_status()
37
+
38
+ soup = BeautifulSoup(response.content, 'html.parser')
39
+
40
+ # Remove unwanted elements
41
+ self._remove_unwanted_elements(soup)
42
+
43
+ # Extract main content
44
+ content = self._extract_main_content(soup)
45
+
46
+ if not content:
47
+ return None
48
+
49
+ # Clean and process content
50
+ cleaned_content = self._clean_content(content)
51
+
52
+ # Focus on medical content if possible
53
+ medical_content = self._extract_medical_content(cleaned_content)
54
+
55
+ # Truncate to max length
56
+ final_content = self._truncate_content(medical_content or cleaned_content, max_length)
57
+
58
+ return final_content if final_content else None
59
+
60
+ except Exception as e:
61
+ logger.warning(f"Content extraction failed for {url}: {e}")
62
+ return None
63
+
64
+ def _remove_unwanted_elements(self, soup: BeautifulSoup):
65
+ """Remove unwanted HTML elements"""
66
+ unwanted_tags = [
67
+ 'script', 'style', 'nav', 'header', 'footer', 'aside',
68
+ 'advertisement', 'ads', 'sidebar', 'menu', 'navigation',
69
+ 'social', 'share', 'comment', 'comments', 'related',
70
+ 'cookie', 'privacy', 'terms', 'disclaimer'
71
+ ]
72
+
73
+ for tag in unwanted_tags:
74
+ for element in soup.find_all(tag):
75
+ element.decompose()
76
+
77
+ # Remove elements with unwanted classes/ids
78
+ unwanted_selectors = [
79
+ '[class*="ad"]', '[class*="advertisement"]', '[class*="sidebar"]',
80
+ '[class*="menu"]', '[class*="nav"]', '[class*="social"]',
81
+ '[class*="share"]', '[class*="comment"]', '[class*="related"]',
82
+ '[id*="ad"]', '[id*="sidebar"]', '[id*="menu"]', '[id*="nav"]'
83
+ ]
84
+
85
+ for selector in unwanted_selectors:
86
+ for element in soup.select(selector):
87
+ element.decompose()
88
+
89
+ def _extract_main_content(self, soup: BeautifulSoup) -> str:
90
+ """Extract main content from the page"""
91
+ # Priority order for content extraction
92
+ content_selectors = [
93
+ 'article',
94
+ 'main',
95
+ '[role="main"]',
96
+ '.content',
97
+ '.main-content',
98
+ '.article-content',
99
+ '.post-content',
100
+ '.entry-content',
101
+ '.page-content',
102
+ 'body'
103
+ ]
104
+
105
+ for selector in content_selectors:
106
+ elements = soup.select(selector)
107
+ if elements:
108
+ # Get the largest content element
109
+ largest_element = max(elements, key=lambda x: len(x.get_text()))
110
+ content = largest_element.get_text(separator=' ', strip=True)
111
+ if len(content) > 100: # Minimum content length
112
+ return content
113
+
114
+ # Fallback: get all text
115
+ return soup.get_text(separator=' ', strip=True)
116
+
117
+ def _clean_content(self, content: str) -> str:
118
+ """Clean and normalize content"""
119
+ if not content:
120
+ return ""
121
+
122
+ # Remove excessive whitespace
123
+ content = re.sub(r'\s+', ' ', content)
124
+
125
+ # Remove common web artifacts
126
+ artifacts = [
127
+ r'Cookie\s+Policy',
128
+ r'Privacy\s+Policy',
129
+ r'Terms\s+of\s+Service',
130
+ r'Subscribe\s+to\s+our\s+newsletter',
131
+ r'Follow\s+us\s+on',
132
+ r'Share\s+this\s+article',
133
+ r'Related\s+articles',
134
+ r'Advertisement',
135
+ r'Ad\s+content'
136
+ ]
137
+
138
+ for artifact in artifacts:
139
+ content = re.sub(artifact, '', content, flags=re.IGNORECASE)
140
+
141
+ # Remove excessive punctuation
142
+ content = re.sub(r'[.]{3,}', '...', content)
143
+ content = re.sub(r'[!]{2,}', '!', content)
144
+ content = re.sub(r'[?]{2,}', '?', content)
145
+
146
+ return content.strip()
147
+
148
+ def _extract_medical_content(self, content: str) -> Optional[str]:
149
+ """Extract medical-focused content from the text"""
150
+ if not content:
151
+ return None
152
+
153
+ # Split content into sentences
154
+ sentences = re.split(r'[.!?]+', content)
155
+ medical_sentences = []
156
+
157
+ for sentence in sentences:
158
+ sentence = sentence.strip()
159
+ if len(sentence) < 20: # Skip very short sentences
160
+ continue
161
+
162
+ # Check if sentence contains medical indicators
163
+ sentence_lower = sentence.lower()
164
+ if any(indicator in sentence_lower for indicator in self.medical_indicators):
165
+ medical_sentences.append(sentence)
166
+
167
+ if medical_sentences:
168
+ # Return medical sentences, prioritizing longer ones
169
+ medical_sentences.sort(key=len, reverse=True)
170
+ return '. '.join(medical_sentences[:10]) + '.'
171
+
172
+ return None
173
+
174
+ def _truncate_content(self, content: str, max_length: int) -> str:
175
+ """Truncate content to max length while preserving sentences"""
176
+ if len(content) <= max_length:
177
+ return content
178
+
179
+ # Try to truncate at sentence boundary
180
+ truncated = content[:max_length]
181
+ last_period = truncated.rfind('.')
182
+ last_exclamation = truncated.rfind('!')
183
+ last_question = truncated.rfind('?')
184
+
185
+ last_sentence_end = max(last_period, last_exclamation, last_question)
186
+
187
+ if last_sentence_end > max_length * 0.7: # If we can find a good break point
188
+ return content[:last_sentence_end + 1]
189
+
190
+ # Fallback: truncate at word boundary
191
+ words = truncated.split()
192
+ if len(words) > 1:
193
+ return ' '.join(words[:-1]) + '...'
194
+
195
+ return truncated + '...'
196
+
197
+ def extract_multiple(self, urls: list, max_length: int = 2000) -> Dict[str, str]:
198
+ """Extract content from multiple URLs"""
199
+ results = {}
200
+
201
+ for url in urls:
202
+ try:
203
+ content = self.extract(url, max_length)
204
+ if content:
205
+ results[url] = content
206
+ time.sleep(0.5) # Be respectful to servers
207
+ except Exception as e:
208
+ logger.warning(f"Failed to extract content from {url}: {e}")
209
+ continue
210
+
211
+ return results
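A brief usage sketch for the extractor above. The URLs are placeholders, and what comes back depends on each page's markup; failed extractions simply return None or are skipped:

```python
from search.extractors.content import ContentExtractor

extractor = ContentExtractor(timeout=15)

# Single page: cleaned, medically focused text, or None if extraction fails.
text = extractor.extract("https://example.org/health-article", max_length=1500)
if text:
    print(text[:200], "...")

# Several pages: a {url: content} dict containing only pages that extracted cleanly.
pages = extractor.extract_multiple(
    ["https://example.org/a", "https://example.org/b"],
    max_length=1000,
)
print(f"extracted {len(pages)} of 2 pages")
```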
search/processors/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .medical import MedicalSearchProcessor
2
+ from .language import LanguageProcessor
3
+ from .sources import SourceAggregator
4
+ from .enhanced import EnhancedContentProcessor
5
+
6
+ __all__ = ['MedicalSearchProcessor', 'LanguageProcessor', 'SourceAggregator', 'EnhancedContentProcessor']
search/processors/cooking.py ADDED
@@ -0,0 +1,258 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ from models.summarizer import summarizer
4
+ import re
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class CookingSearchProcessor:
9
+ """Process and enhance cooking search results"""
10
+
11
+ def __init__(self):
12
+ self.cooking_keywords = [
13
+ 'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
14
+ 'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
15
+ 'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
16
+ 'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
17
+ 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
18
+ 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
19
+ 'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai',
20
+ 'substitution', 'alternative', 'variation', 'modification', 'adaptation',
21
+ 'troubleshooting', 'tips', 'tricks', 'hacks', 'mistakes', 'common errors'
22
+ ]
23
+
24
+ def process_results(self, results: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
25
+ """Process search results and create comprehensive cooking summary"""
26
+ if not results:
27
+ return "", {}
28
+
29
+ # Filter and rank results by cooking relevance
30
+ relevant_results = self._filter_cooking_results(results, user_query)
31
+
32
+ if not relevant_results:
33
+ logger.warning("No cooking-relevant results found")
34
+ return "", {}
35
+
36
+ # Extract and summarize content
37
+ summarized_results = self._summarize_results(relevant_results, user_query)
38
+
39
+ # Create comprehensive summary
40
+ combined_summary = self._create_combined_summary(summarized_results, user_query)
41
+
42
+ # Create URL mapping for citations
43
+ url_mapping = self._create_url_mapping(relevant_results)
44
+
45
+ return combined_summary, url_mapping
46
+
47
+ def _filter_cooking_results(self, results: List[Dict], user_query: str) -> List[Dict]:
48
+ """Filter results by cooking relevance"""
49
+ relevant_results = []
50
+
51
+ for result in results:
52
+ relevance_score = self._calculate_relevance_score(result, user_query)
53
+
54
+ if relevance_score > 0.3: # Threshold for cooking relevance
55
+ result['relevance_score'] = relevance_score
56
+ relevant_results.append(result)
57
+
58
+ # Sort by relevance score
59
+ relevant_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
60
+
61
+ # Limit to top results
62
+ return relevant_results[:10]
63
+
64
+ def _calculate_relevance_score(self, result: Dict, user_query: str) -> float:
65
+ """Calculate cooking relevance score for a result"""
66
+ score = 0.0
67
+
68
+ # Check title relevance
69
+ title = result.get('title', '').lower()
70
+ query_lower = user_query.lower()
71
+
72
+ # Direct query match in title
73
+ if any(word in title for word in query_lower.split()):
74
+ score += 0.4
75
+
76
+ # Cooking keyword match in title
77
+ cooking_matches = sum(1 for keyword in self.cooking_keywords if keyword in title)
78
+ score += min(cooking_matches * 0.1, 0.3)
79
+
80
+ # Domain credibility for cooking sources
81
+ url = result.get('url', '').lower()
82
+ credible_domains = [
83
+ 'allrecipes.com', 'foodnetwork.com', 'epicurious.com', 'seriouseats.com',
84
+ 'bonappetit.com', 'cooking.nytimes.com', 'tasteofhome.com', 'food.com',
85
+ 'bbcgoodfood.com', 'jamieoliver.com', 'gordonramsay.com', 'marthastewart.com',
86
+ 'kingarthurbaking.com', 'sallysbakingaddiction.com', 'smittenkitchen.com'
87
+ ]
88
+
89
+ if any(domain in url for domain in credible_domains):
90
+ score += 0.3
91
+
92
+ # Source type bonus for cooking
93
+ source = result.get('source', '')
94
+ if 'cooking' in source or 'recipe' in source or any(domain in source for domain in credible_domains):
95
+ score += 0.2
96
+
97
+ return min(score, 1.0)
98
+
99
+ def _summarize_results(self, results: List[Dict], user_query: str) -> List[Dict]:
100
+ """Summarize content from search results"""
101
+ summarized_results = []
102
+
103
+ for i, result in enumerate(results):
104
+ try:
105
+ content = result.get('content', '')
106
+ if not content:
107
+ continue
108
+
109
+ # Create focused summary
110
+ summary = summarizer.summarize_for_query(content, user_query, max_length=300)
111
+
112
+ if summary:
113
+ summarized_results.append({
114
+ 'id': i + 1,
115
+ 'url': result['url'],
116
+ 'title': result['title'],
117
+ 'summary': summary,
118
+ 'relevance_score': result.get('relevance_score', 0)
119
+ })
120
+
121
+ except Exception as e:
122
+ logger.warning(f"Failed to summarize result {i}: {e}")
123
+ continue
124
+
125
+ return summarized_results
126
+
127
+ def _create_combined_summary(self, summarized_results: List[Dict], user_query: str) -> str:
128
+ """Create a comprehensive summary from all results with proper source attribution"""
129
+ if not summarized_results:
130
+ return ""
131
+
132
+ logger.info(f"Creating combined summary from {len(summarized_results)} results")
133
+
134
+ # Group by topic/similarity
135
+ topic_groups = self._group_by_topic(summarized_results)
136
+
137
+ summary_parts = []
138
+ citation_counter = 1
139
+
140
+ for topic, results in topic_groups.items():
141
+ if not results:
142
+ continue
143
+
144
+ logger.info(f"Processing {topic} topic with {len(results)} results")
145
+
146
+ # Create topic summary with source attribution
147
+ topic_summary = self._create_topic_summary(topic, results, user_query, citation_counter)
148
+ if topic_summary:
149
+ summary_parts.append(topic_summary)
150
+ # Update citation counter for next topic
151
+ citation_counter += len([r for r in results if r.get('summary')])
152
+
153
+ # Combine all parts
154
+ combined_summary = "\n\n".join(summary_parts)
155
+
156
+ # Don't over-summarize - keep source attribution intact
157
+ if len(combined_summary) > 2000:
158
+ # Only truncate if absolutely necessary, but preserve structure
159
+ lines = combined_summary.split('\n')
160
+ truncated_lines = []
161
+ current_length = 0
162
+
163
+ for line in lines:
164
+ if current_length + len(line) > 2000:
165
+ break
166
+ truncated_lines.append(line)
167
+ current_length += len(line)
168
+
169
+ combined_summary = '\n'.join(truncated_lines)
170
+ if len(truncated_lines) < len(lines):
171
+ combined_summary += "\n\n*[Additional information available from multiple sources]*"
172
+
173
+ logger.info(f"Final combined summary length: {len(combined_summary)} characters")
174
+ return combined_summary
175
+
176
+ def _group_by_topic(self, results: List[Dict]) -> Dict[str, List[Dict]]:
177
+ """Group results by cooking topic"""
178
+ topics = {
179
+ 'recipes': [],
180
+ 'techniques': [],
181
+ 'ingredients': [],
182
+ 'general': []
183
+ }
184
+
185
+ for result in results:
186
+ title_lower = result['title'].lower()
187
+ summary_lower = result.get('summary', '').lower()
188
+ content_lower = f"{title_lower} {summary_lower}"
189
+
190
+ # Categorize by content
191
+ if any(word in content_lower for word in ['recipe', 'ingredients', 'instructions', 'steps']):
192
+ topics['recipes'].append(result)
193
+ elif any(word in content_lower for word in ['technique', 'method', 'how to', 'cooking']):
194
+ topics['techniques'].append(result)
195
+ elif any(word in content_lower for word in ['ingredients', 'substitution', 'alternative', 'variation']):
196
+ topics['ingredients'].append(result)
197
+ else:
198
+ topics['general'].append(result)
199
+
200
+ return topics
201
+
202
+ def _create_topic_summary(self, topic: str, results: List[Dict], user_query: str, citation_start: int = 1) -> str:
203
+ """Create summary for a specific topic with source attribution"""
204
+ if not results:
205
+ return ""
206
+
207
+ # Add topic header
208
+ topic_headers = {
209
+ 'recipes': "**Recipes and Instructions:**",
210
+ 'techniques': "**Cooking Techniques:**",
211
+ 'ingredients': "**Ingredients and Substitutions:**",
212
+ 'general': "**General Information:**"
213
+ }
214
+
215
+ header = topic_headers.get(topic, "**Information:**")
216
+ summary_parts = [header]
217
+
218
+ # Process each result individually to maintain source attribution
219
+ for i, result in enumerate(results[:3]): # Limit to top 3 per topic
220
+ summary = result.get('summary', '')
221
+ if not summary:
222
+ continue
223
+
224
+ # Extract domain from URL for source attribution
225
+ url = result.get('url', '')
226
+ domain = self._extract_domain(url)
227
+
228
+ # Use proper citation number
229
+ citation_num = citation_start + i
230
+
231
+ # Add source attribution (domain plus citation number)
232
+ summary_with_source = f"* {summary} ({domain}) <#{citation_num}>"
233
+ summary_parts.append(summary_with_source)
234
+
235
+ return "\n".join(summary_parts)
236
+
237
+ def _extract_domain(self, url: str) -> str:
238
+ """Extract domain name from URL"""
239
+ try:
240
+ from urllib.parse import urlparse
241
+ parsed = urlparse(url)
242
+ domain = parsed.netloc.lower()
243
+ # Remove www. prefix
244
+ if domain.startswith('www.'):
245
+ domain = domain[4:]
246
+ return domain
247
+ except:
248
+ return ""
249
+
250
+ def _create_url_mapping(self, results: List[Dict]) -> Dict[int, str]:
251
+ """Create URL mapping for citations"""
252
+ url_mapping = {}
253
+
254
+ for i, result in enumerate(results):
255
+ url_mapping[i + 1] = result['url']
256
+
257
+ logger.info(f"Created URL mapping for {len(url_mapping)} sources")
258
+ return url_mapping
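A small sketch of how the processor above is driven. The input result is fabricated for illustration; `process_results` returns the combined topic summary plus a `{citation_number: url}` map matching the `<#N>` markers it emits (note it also pulls in `models.summarizer`, so the summarizer model must be available):

```python
from search.processors.cooking import CookingSearchProcessor

processor = CookingSearchProcessor()

# Fabricated search result with pre-extracted page content.
raw_results = [
    {
        "url": "https://www.seriouseats.com/example-carbonara",
        "title": "Classic carbonara recipe and technique",
        "content": "Ingredients include guanciale, eggs, and pecorino. "
                   "Cooking time is about 20 minutes on the stovetop.",
        "source": "seriouseats.com",
    },
]

summary, url_mapping = processor.process_results(raw_results, "how to make carbonara")
print(summary)        # topic sections with <#N> citation markers
print(url_mapping)    # e.g. {1: "https://www.seriouseats.com/example-carbonara"}
```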
search/processors/enhanced.py ADDED
@@ -0,0 +1,331 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple, Set
3
+ import re
4
+ from collections import defaultdict
5
+ from models.summarizer import summarizer
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class EnhancedContentProcessor:
10
+ """Enhanced content processing for maximum information extraction"""
11
+
12
+ def __init__(self):
13
+ # Cooking content patterns for extraction
14
+ self.cooking_patterns = {
15
+ 'ingredients': [
16
+ r'ingredients?\s+(?:include|are|may include|can include)',
17
+ r'you\s+need',
18
+ r'required\s+ingredients?',
19
+ r'main\s+ingredients?',
20
+ r'key\s+ingredients?'
21
+ ],
22
+ 'techniques': [
23
+ r'techniques?\s+(?:include|are|may include|can include)',
24
+ r'cooking\s+methods?',
25
+ r'preparation\s+methods?',
26
+ r'how\s+to\s+cook',
27
+ r'cooking\s+process'
28
+ ],
29
+ 'instructions': [
30
+ r'instructions?\s+(?:include|are|may include)',
31
+ r'steps?\s+(?:include|are|may include)',
32
+ r'how\s+to\s+make',
33
+ r'preparation\s+steps?',
34
+ r'cooking\s+steps?'
35
+ ],
36
+ 'timing': [
37
+ r'timing\s+(?:include|are|may include)',
38
+ r'cooking\s+time',
39
+ r'preparation\s+time',
40
+ r'total\s+time',
41
+ r'duration'
42
+ ],
43
+ 'tips': [
44
+ r'tips?\s+(?:include|are|may include)',
45
+ r'advice\s+(?:include|are|may include)',
46
+ r'recommendations?',
47
+ r'helpful\s+hints?',
48
+ r'secrets?'
49
+ ],
50
+ 'variations': [
51
+ r'variations?\s+(?:include|are|may include)',
52
+ r'substitutions?\s+(?:include|are|may include)',
53
+ r'alternatives?\s+(?:include|are|may include)',
54
+ r'modifications?\s+(?:include|are|may include)',
55
+ r'complications?'
56
+ ]
57
+ }
58
+
59
+ # Content quality indicators
60
+ self.quality_indicators = {
61
+ 'high': [
62
+ 'professional chef', 'culinary institute', 'food science', 'nutrition research',
63
+ 'evidence-based', 'peer-reviewed', 'published study', 'research shows',
64
+ 'culinary guidelines', 'chef consensus', 'expert opinion'
65
+ ],
66
+ 'medium': [
67
+ 'studies show', 'research indicates', 'culinary literature',
68
+ 'professional experience', 'case studies', 'observational studies'
69
+ ],
70
+ 'low': [
71
+ 'some people', 'may help', 'could be', 'might work',
72
+ 'anecdotal', 'personal experience', 'unverified'
73
+ ]
74
+ }
75
+
76
+ def process_comprehensive_content(self, sources: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
77
+ """Process all sources to extract maximum relevant information"""
78
+ if not sources:
79
+ return "", {}
80
+
81
+ logger.info(f"Processing {len(sources)} sources for comprehensive information extraction")
82
+
83
+ # Extract structured information from each source
84
+ structured_info = self._extract_structured_information(sources, user_query)
85
+
86
+ # Create comprehensive summary
87
+ comprehensive_summary = self._create_comprehensive_summary(structured_info, user_query)
88
+
89
+ # Create detailed reference mapping
90
+ reference_mapping = self._create_detailed_reference_mapping(sources)
91
+
92
+ return comprehensive_summary, reference_mapping
93
+
94
+ def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
95
+ """Extract structured information by cooking categories"""
96
+ structured_info = defaultdict(list)
97
+
98
+ for source in sources:
99
+ content = source.get('content', '')
100
+ if not content:
101
+ continue
102
+
103
+ # Extract information by cooking category
104
+ for category, patterns in self.cooking_patterns.items():
105
+ extracted_info = self._extract_category_info(content, patterns, category, user_query)
106
+ if extracted_info:
107
+ structured_info[category].append({
108
+ 'content': extracted_info,
109
+ 'source': source,
110
+ 'relevance_score': self._calculate_relevance_score(extracted_info, user_query)
111
+ })
112
+
113
+ # Sort by relevance within each category
114
+ for category in structured_info:
115
+ structured_info[category].sort(key=lambda x: x['relevance_score'], reverse=True)
116
+
117
+ return dict(structured_info)
118
+
119
+ def _extract_category_info(self, content: str, patterns: List[str], category: str, user_query: str) -> str:
120
+ """Extract information for a specific cooking category"""
121
+ extracted_sentences = []
122
+
123
+ # Split content into sentences
124
+ sentences = re.split(r'[.!?]+', content)
125
+
126
+ for sentence in sentences:
127
+ sentence = sentence.strip()
128
+ if len(sentence) < 20: # Skip very short sentences
129
+ continue
130
+
131
+ # Check if sentence matches any pattern for this category
132
+ for pattern in patterns:
133
+ if re.search(pattern, sentence, re.IGNORECASE):
134
+ # Check relevance to user query
135
+ if self._is_relevant_to_query(sentence, user_query):
136
+ extracted_sentences.append(sentence)
137
+ break
138
+
139
+ # Combine and summarize extracted sentences
140
+ if extracted_sentences:
141
+ combined_text = '. '.join(extracted_sentences[:5]) # Limit to top 5 sentences
142
+ return summarizer.summarize_for_query(combined_text, user_query, max_length=300)
143
+
144
+ return ""
145
+
146
+ def _is_relevant_to_query(self, sentence: str, user_query: str) -> bool:
147
+ """Check if sentence is relevant to user query"""
148
+ query_words = set(user_query.lower().split())
149
+ sentence_words = set(sentence.lower().split())
150
+
151
+ # Calculate word overlap
152
+ overlap = len(query_words.intersection(sentence_words))
153
+ return overlap >= 2 # At least 2 words in common
154
+
155
+ def _calculate_relevance_score(self, content: str, user_query: str) -> float:
156
+ """Calculate relevance score for content"""
157
+ if not content or not user_query:
158
+ return 0.0
159
+
160
+ query_words = set(user_query.lower().split())
161
+ content_words = set(content.lower().split())
162
+
163
+ # Word overlap score
164
+ overlap = len(query_words.intersection(content_words))
165
+ overlap_score = overlap / len(query_words) if query_words else 0
166
+
167
+ # Content quality score
168
+ quality_score = self._assess_content_quality(content)
169
+
170
+ # Length score (prefer medium-length content)
171
+ length_score = min(len(content) / 500, 1.0) # Normalize to 0-1
172
+
173
+ # Composite score
174
+ composite_score = (
175
+ overlap_score * 0.5 + # 50% relevance to query
176
+ quality_score * 0.3 + # 30% content quality
177
+ length_score * 0.2 # 20% appropriate length
178
+ )
179
+
180
+ return min(composite_score, 1.0)
181
+
182
+ def _assess_content_quality(self, content: str) -> float:
183
+ """Assess content quality based on cooking indicators"""
184
+ content_lower = content.lower()
185
+
186
+ high_indicators = sum(1 for indicator in self.quality_indicators['high'] if indicator in content_lower)
187
+ medium_indicators = sum(1 for indicator in self.quality_indicators['medium'] if indicator in content_lower)
188
+ low_indicators = sum(1 for indicator in self.quality_indicators['low'] if indicator in content_lower)
189
+
190
+ # Calculate quality score
191
+ if high_indicators > 0:
192
+ return 0.9
193
+ elif medium_indicators > 0:
194
+ return 0.7
195
+ elif low_indicators > 0:
196
+ return 0.5
197
+ else:
198
+ return 0.6 # Default score for neutral content
199
+
200
+ def _create_comprehensive_summary(self, structured_info: Dict[str, List[Dict]], user_query: str) -> str:
201
+ """Create comprehensive summary from structured information"""
202
+ if not structured_info:
203
+ return ""
204
+
205
+ summary_parts = []
206
+
207
+ # Process each category
208
+ category_headers = {
209
+ 'ingredients': "**🥘 Ingredients & Shopping:**",
210
+ 'techniques': "**👨‍🍳 Cooking Techniques:**",
211
+ 'instructions': "**📋 Step-by-Step Instructions:**",
212
+ 'timing': "**⏰ Timing & Preparation:**",
213
+ 'tips': "**💡 Pro Tips & Tricks:**",
214
+ 'variations': "**🔄 Variations & Substitutions:**"
215
+ }
216
+
217
+ for category, info_list in structured_info.items():
218
+ if not info_list:
219
+ continue
220
+
221
+ # Take top 2 most relevant items for each category
222
+ top_items = info_list[:2]
223
+
224
+ category_content = []
225
+ for item in top_items:
226
+ content = item['content']
227
+ if content:
228
+ category_content.append(content)
229
+
230
+ if category_content:
231
+ # Combine and summarize category content
232
+ combined_content = ' '.join(category_content)
233
+ category_summary = summarizer.summarize_for_query(combined_content, user_query, max_length=400)
234
+
235
+ if category_summary:
236
+ header = category_headers.get(category, f"**{category.title()}:**")
237
+ summary_parts.append(f"{header}\n{category_summary}")
238
+
239
+ # Combine all parts
240
+ comprehensive_summary = "\n\n".join(summary_parts)
241
+
242
+ # Final summarization to ensure conciseness
243
+ if len(comprehensive_summary) > 2000:
244
+ comprehensive_summary = summarizer.summarize_text(comprehensive_summary, max_length=2000)
245
+
246
+ return comprehensive_summary
247
+
248
+ def _create_detailed_reference_mapping(self, sources: List[Dict]) -> Dict[int, Dict]:
249
+ """Create detailed reference mapping with source metadata"""
250
+ reference_mapping = {}
251
+
252
+ for i, source in enumerate(sources, 1):
253
+ # Be defensive: some upstream sources may miss optional fields
254
+ reference_mapping[i] = {
255
+ 'url': source.get('url', ''),
256
+ 'title': source.get('title', ''),
257
+ 'domain': source.get('domain', ''),
258
+ 'source_type': source.get('source_type', 'text'),
259
+ 'language': source.get('language', 'en'),
260
+ 'type': source.get('type', 'text'),
261
+ 'content_length': len(source.get('content', '')),
262
+ 'composite_score': source.get('composite_score', 0.7)
263
+ }
264
+
265
+ return reference_mapping
266
+
267
+ def create_inline_citations(self, text: str, reference_mapping: Dict[int, Dict]) -> str:
268
+ """Create inline citations within the text"""
269
+ if not reference_mapping:
270
+ return text
271
+
272
+ # Find places where citations should be added
273
+ # This is a simplified version - in practice, you'd use more sophisticated NLP
274
+
275
+ # Add citations after key cooking statements
276
+ citation_patterns = [
277
+ r'(ingredients?\s+(?:include|are)[^.]*\.)',
278
+ r'(techniques?\s+(?:include|are)[^.]*\.)',
279
+ r'(instructions?\s+(?:include|are)[^.]*\.)',
280
+ r'(timing\s+(?:include|are)[^.]*\.)',
281
+ r'(studies?\s+show[^.]*\.)',
282
+ r'(research\s+(?:indicates|shows)[^.]*\.)'
283
+ ]
284
+
285
+ cited_text = text
286
+ citation_count = 1
287
+
288
+ for pattern in citation_patterns:
289
+ matches = re.finditer(pattern, cited_text, re.IGNORECASE)
290
+ for match in matches:
291
+ if citation_count <= len(reference_mapping):
292
+ citation_tag = f" <#{citation_count}>"
293
+ cited_text = cited_text.replace(match.group(1), match.group(1) + citation_tag, 1)
294
+ citation_count += 1
295
+
296
+ return cited_text
297
+
298
+ def generate_source_statistics(self, sources: List[Dict]) -> str:
299
+ """Generate statistics about sources used"""
300
+ if not sources:
301
+ return ""
302
+
303
+ total_sources = len(sources)
304
+ # credibility removed
305
+
306
+ # Language distribution
307
+ languages = defaultdict(int)
308
+ for source in sources:
309
+ lang = source.get('language', 'en')
310
+ languages[lang] += 1
311
+
312
+ # Source type distribution
313
+ source_types = defaultdict(int)
314
+ for source in sources:
315
+ source_type = source.get('source_type', 'other')
316
+ source_types[source_type] += 1
317
+
318
+ # Content length statistics
319
+ content_lengths = [len(s.get('content', '')) for s in sources]
320
+ avg_content_length = sum(content_lengths) / len(content_lengths) if content_lengths else 0
321
+
322
+ stats_parts = []
323
+ stats_parts.append(f"**📊 Source Statistics:**")
324
+ stats_parts.append(f"• **Total Sources**: {total_sources}")
325
+ # removed credibility summary
326
+ stats_parts.append(f"• **Languages**: {', '.join([f'{count} {lang}' for lang, count in languages.items()])}")
327
+ stats_parts.append(f"• **Types**: {', '.join([f'{count} {type_name}' for type_name, count in source_types.items()])}")
328
+ stats_parts.append(f"• **Avg Content Length**: {avg_content_length:.0f} characters")
329
+
330
+ return "\n".join(stats_parts)
331
+
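For orientation, here is a minimal usage sketch of the processor defined in this file. The class name and the module-level `summarizer` dependency are declared earlier in the file and are not visible in this diff, so `ComprehensiveProcessor` below is an assumed name rather than a confirmed one:

```python
# Hypothetical usage sketch; the real class name is defined earlier in
# search/processors/comprehensive.py and may differ from the one assumed here.
from search.processors.comprehensive import ComprehensiveProcessor  # assumed name

sources = [
    {
        "url": "https://example.com/roast-chicken",
        "title": "How to roast a chicken",
        "content": (
            "Ingredients include a whole chicken, butter, and thyme for the roast. "
            "Instructions include preheating the oven and roasting the chicken for about an hour."
        ),
        "language": "en",
        "source_type": "commercial",
        "type": "text",
    }
]

processor = ComprehensiveProcessor()
summary, refs = processor.process_comprehensive_content(sources, "how to roast a chicken")
print(summary)   # category-grouped summary (ingredients, instructions, ...); may be empty for sparse input
print(refs[1])   # per-source metadata dict used for citations
```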
search/processors/language.py ADDED
@@ -0,0 +1,266 @@
1
+ import re
2
+ import logging
3
+ from typing import List, Dict, Tuple, Optional
4
+ from langdetect import detect, DetectorFactory
5
+ from langdetect.lang_detect_exception import LangDetectException
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # Set seed for consistent language detection
10
+ DetectorFactory.seed = 0
11
+
12
+ class LanguageProcessor:
13
+ """Process and enhance queries for multilingual medical search"""
14
+
15
+ def __init__(self):
16
+ # Medical keywords in different languages
17
+ self.medical_keywords = {
18
+ 'en': [
19
+ 'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
20
+ 'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
21
+ 'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
22
+ 'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
23
+ 'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
24
+ 'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
25
+ 'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
26
+ 'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
27
+ 'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
28
+ ],
29
+ 'vi': [
30
+ 'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
31
+ 'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
32
+ 'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác sĩ', 'y tế',
33
+ 'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
34
+ 'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
35
+ 'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
36
+ 'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
37
+ 'em bé', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
38
+ 'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
39
+ ],
40
+ 'zh': [
41
+ '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
42
+ '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
43
+ '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
44
+ '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
45
+ '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
46
+ '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
47
+ '手术', '医院', '诊所'
48
+ ]
49
+ }
50
+
51
+ # Language-specific search enhancements
52
+ self.language_enhancements = {
53
+ 'vi': {
54
+ 'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
55
+ 'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác sĩ']
56
+ },
57
+ 'zh': {
58
+ 'common_terms': ['是什么', '原因', '治疗方法', '症状'],
59
+ 'medical_context': ['医疗', '健康', '医院', '医生']
60
+ },
61
+ 'en': {
62
+ 'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
63
+ 'medical_context': ['medical', 'health', 'hospital', 'doctor']
64
+ }
65
+ }
66
+
67
+ def detect_language(self, text: str) -> str:
68
+ """Detect the language of the input text"""
69
+ if not text or not text.strip():
70
+ return 'en' # Default to English
71
+
72
+ try:
73
+ # Clean text for better detection
74
+ cleaned_text = re.sub(r'[^\w\s]', ' ', text)
75
+ cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
76
+
77
+ if len(cleaned_text) < 3:
78
+ return 'en'
79
+
80
+ detected = detect(cleaned_text)
81
+
82
+ # Map detected language to our supported languages
83
+ language_mapping = {
84
+ 'vi': 'vi', # Vietnamese
85
+ 'zh-cn': 'zh', # Chinese Simplified
86
+ 'zh-tw': 'zh', # Chinese Traditional
87
+ 'zh': 'zh', # Chinese
88
+ 'en': 'en' # English
89
+ }
90
+
91
+ return language_mapping.get(detected, 'en')
92
+
93
+ except LangDetectException as e:
94
+ logger.warning(f"Language detection failed: {e}")
95
+ return 'en'
96
+
97
+ def enhance_query(self, query: str, target_language: str = None) -> Dict[str, str]:
98
+ """Enhance query for better search results in multiple languages"""
99
+ if not query or not query.strip():
100
+ return {}
101
+
102
+ # Detect source language
103
+ source_language = self.detect_language(query)
104
+
105
+ # If target language not specified, use source language
106
+ if target_language is None:
107
+ target_language = source_language
108
+
109
+ enhanced_queries = {}
110
+
111
+ # Original query
112
+ enhanced_queries[source_language] = query
113
+
114
+ # Enhance for source language
115
+ if source_language in self.language_enhancements:
116
+ enhanced_queries[source_language] = self._enhance_for_language(
117
+ query, source_language
118
+ )
119
+
120
+ # Create translations for other languages if needed
121
+ if target_language != source_language:
122
+ enhanced_queries[target_language] = self._translate_query(
123
+ query, source_language, target_language
124
+ )
125
+
126
+ # Add English version for comprehensive search
127
+ if 'en' not in enhanced_queries:
128
+ if source_language != 'en':
129
+ enhanced_queries['en'] = self._translate_query(query, source_language, 'en')
130
+ else:
131
+ enhanced_queries['en'] = query
132
+
133
+ return enhanced_queries
134
+
135
+ def _enhance_for_language(self, query: str, language: str) -> str:
136
+ """Enhance query for a specific language"""
137
+ enhancements = self.language_enhancements.get(language, {})
138
+ common_terms = enhancements.get('common_terms', [])
139
+ medical_context = enhancements.get('medical_context', [])
140
+
141
+ # Check if query already contains medical context
142
+ query_lower = query.lower()
143
+ has_medical_context = any(term in query_lower for term in medical_context)
144
+
145
+ # If no medical context, add it
146
+ if not has_medical_context and medical_context:
147
+ # Add the most relevant medical context term
148
+ query += f" {medical_context[0]}"
149
+
150
+ # Check if query is a question and add relevant terms
151
+ if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
152
+ if common_terms:
153
+ query += f" {common_terms[0]}" # Add "causes" or equivalent
154
+
155
+ return query.strip()
156
+
157
+ def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
158
+ """Simple keyword-based translation for medical terms"""
159
+ # This is a basic implementation - in production, you'd use a proper translation service
160
+
161
+ # Medical term translations
162
+ translations = {
163
+ ('vi', 'en'): {
164
+ 'triệu chứng': 'symptoms',
165
+ 'đau': 'pain',
166
+ 'đau đầu': 'headache',
167
+ 'sốt': 'fever',
168
+ 'ho': 'cough',
169
+ 'điều trị': 'treatment',
170
+ 'thuốc': 'medicine',
171
+ 'bệnh': 'disease',
172
+ 'bác sĩ': 'doctor',
173
+ 'sức khỏe': 'health',
174
+ 'bệnh viện': 'hospital'
175
+ },
176
+ ('zh', 'en'): {
177
+ '症状': 'symptoms',
178
+ '疼痛': 'pain',
179
+ '头痛': 'headache',
180
+ '发烧': 'fever',
181
+ '咳嗽': 'cough',
182
+ '治疗': 'treatment',
183
+ '药物': 'medicine',
184
+ '疾病': 'disease',
185
+ '医生': 'doctor',
186
+ '健康': 'health',
187
+ '医院': 'hospital'
188
+ },
189
+ ('en', 'vi'): {
190
+ 'symptoms': 'triệu chứng',
191
+ 'pain': 'đau',
192
+ 'headache': 'đau đầu',
193
+ 'fever': 'sốt',
194
+ 'cough': 'ho',
195
+ 'treatment': 'điều trị',
196
+ 'medicine': 'thuốc',
197
+ 'disease': 'bệnh',
198
+ 'doctor': 'bác sĩ',
199
+ 'health': 'sức khỏe',
200
+ 'hospital': 'bệnh viện'
201
+ },
202
+ ('en', 'zh'): {
203
+ 'symptoms': '症状',
204
+ 'pain': '疼痛',
205
+ 'headache': '头痛',
206
+ 'fever': '发烧',
207
+ 'cough': '咳嗽',
208
+ 'treatment': '治疗',
209
+ 'medicine': '药物',
210
+ 'disease': '疾病',
211
+ 'doctor': '医生',
212
+ 'health': '健康',
213
+ 'hospital': '医院'
214
+ }
215
+ }
216
+
217
+ translation_map = translations.get((source_lang, target_lang), {})
218
+
219
+ # Simple word-by-word translation
220
+ translated_query = query
221
+ for source_term, target_term in translation_map.items():
222
+ translated_query = translated_query.replace(source_term, target_term)
223
+
224
+ return translated_query
225
+
226
+ def get_medical_relevance_score(self, text: str, language: str) -> float:
227
+ """Calculate medical relevance score for text in a specific language"""
228
+ if not text:
229
+ return 0.0
230
+
231
+ keywords = self.medical_keywords.get(language, [])
232
+ if not keywords:
233
+ return 0.0
234
+
235
+ text_lower = text.lower()
236
+ matches = sum(1 for keyword in keywords if keyword in text_lower)
237
+
238
+ # Normalize by text length and keyword count
239
+ score = matches / max(len(keywords), 1)
240
+
241
+ # Boost score for longer matches
242
+ if matches > 0:
243
+ score *= (1 + matches * 0.1)
244
+
245
+ return min(score, 1.0)
246
+
247
+ def filter_by_language(self, results: List[Dict], target_language: str) -> List[Dict]:
248
+ """Filter results by language preference"""
249
+ if not results:
250
+ return results
251
+
252
+ # Score results by language match
253
+ scored_results = []
254
+ for result in results:
255
+ result_language = result.get('language', 'en')
256
+ language_score = 1.0 if result_language == target_language else 0.5
257
+
258
+ # Add language score to result
259
+ result_copy = result.copy()
260
+ result_copy['language_score'] = language_score
261
+ scored_results.append(result_copy)
262
+
263
+ # Sort by language score (prefer target language)
264
+ scored_results.sort(key=lambda x: x.get('language_score', 0), reverse=True)
265
+
266
+ return scored_results
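A short, hedged usage sketch of `LanguageProcessor`; the output comments are approximate and deliberately show the limits of the word-by-word keyword translation:

```python
from search.processors.language import LanguageProcessor

lp = LanguageProcessor()

print(lp.detect_language("đau đầu và sốt"))   # typically 'vi' (seeded detector, so deterministic)

print(lp.enhance_query("đau đầu và sốt"))
# -> approximately {'vi': 'đau đầu và sốt y tế', 'en': 'pain đầu và fever'}
#    The shorter key 'đau' is substituted before 'đau đầu', and unmapped words pass
#    through, so the English variant is only a rough keyword translation.

print(lp.get_medical_relevance_score("fever and cough treatment options", "en"))  # small positive score
```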
search/processors/sources.py ADDED
@@ -0,0 +1,352 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple, Set, Any
3
+ import re
4
+ from urllib.parse import urlparse
5
+ from collections import defaultdict
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class SourceAggregator:
10
+ """Aggregate and process sources for comprehensive information extraction"""
11
+
12
+ def __init__(self):
13
+ # (Removed credibility scoring; keep placeholder map for future use)
14
+ self.source_credibility = {
15
+ # English sources
16
+ 'mayoclinic.org': 0.95,
17
+ 'webmd.com': 0.90,
18
+ 'healthline.com': 0.88,
19
+ 'medlineplus.gov': 0.95,
20
+ 'nih.gov': 0.98,
21
+ 'cdc.gov': 0.98,
22
+ 'who.int': 0.97,
23
+ 'pubmed.ncbi.nlm.nih.gov': 0.96,
24
+ 'uptodate.com': 0.94,
25
+ 'merckmanuals.com': 0.92,
26
+ 'medscape.com': 0.89,
27
+
28
+ # Vietnamese sources
29
+ 'hellobacsi.com': 0.85,
30
+ 'alobacsi.com': 0.82,
31
+ 'vinmec.com': 0.88,
32
+ 'tamanhhospital.vn': 0.85,
33
+ 'medlatec.vn': 0.83,
34
+ 'suckhoedoisong.vn': 0.90,
35
+ 'viendinhduong.vn': 0.87,
36
+
37
+ # Chinese sources
38
+ 'haodf.com': 0.86,
39
+ 'dxy.cn': 0.89,
40
+ 'chunyuyisheng.com': 0.84,
41
+ 'xywy.com': 0.82,
42
+ 'jiankang.com': 0.80,
43
+ 'familydoctor.com.cn': 0.85,
44
+
45
+ # Video platforms
46
+ 'youtube.com': 0.70,
47
+ # 'medscape.com' is already listed above under the English sources
48
+ }
49
+
50
+ # Source type classification
51
+ self.source_types = {
52
+ 'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
53
+ 'hospital': ['mayoclinic.org', 'vinmec.com', 'tamanhhospital.vn'],
54
+ 'commercial': ['webmd.com', 'healthline.com', 'hellobacsi.com'],
55
+ 'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
56
+ 'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
57
+ 'video': ['youtube.com', 'medscape.com']
58
+ }
59
+
60
+ def aggregate_sources(self, search_results: List[Dict], video_results: List[Dict] = None) -> Dict[str, Any]:
61
+ """Aggregate all sources and create comprehensive reference system"""
62
+ all_sources = []
63
+
64
+ # Process search results
65
+ for result in search_results:
66
+ source_info = self._process_source(result)
67
+ if source_info:
68
+ all_sources.append(source_info)
69
+
70
+ # Process video results
71
+ if video_results:
72
+ for video in video_results:
73
+ video_info = self._process_video_source(video)
74
+ if video_info:
75
+ all_sources.append(video_info)
76
+
77
+ # Remove duplicates and score sources
78
+ unique_sources = self._deduplicate_sources(all_sources)
79
+ scored_sources = self._score_sources(unique_sources)
80
+
81
+ # Create comprehensive reference mapping
82
+ reference_mapping = self._create_reference_mapping(scored_sources)
83
+
84
+ # Generate source summary
85
+ source_summary = self._generate_source_summary(scored_sources)
86
+
87
+ return {
88
+ 'sources': scored_sources,
89
+ 'reference_mapping': reference_mapping,
90
+ 'source_summary': source_summary,
91
+ 'total_sources': len(scored_sources),
92
+ 'languages': self._get_language_distribution(scored_sources),
93
+ 'source_types': self._get_source_type_distribution(scored_sources)
94
+ }
95
+
96
+ def _process_source(self, result: Dict) -> Dict:
97
+ """Process a single search result into standardized source format"""
98
+ url = (result or {}).get('url', '')
99
+ if not url:
100
+ return None
101
+
102
+ domain = self._extract_domain(url)
103
+ source_type = self._classify_source_type(domain)
104
+ # Normalize fields with safe defaults
105
+ title = str(result.get('title', '') or '').strip()
106
+ content = str(result.get('content', '') or '')
107
+ language = (result.get('language') or 'en').lower()
108
+ source_name = str(result.get('source', '') or '')
109
+ platform = str(result.get('platform', '') or '')
110
+
111
+ return {
112
+ 'url': url,
113
+ 'title': title,
114
+ 'content': content,
115
+ 'domain': domain,
116
+ 'source_type': source_type,
117
+ 'language': language,
118
+ 'source_name': source_name,
119
+ 'platform': platform,
120
+ 'type': 'text'
121
+ }
122
+
123
+ def _process_video_source(self, video: Dict) -> Dict:
124
+ """Process a video result into standardized source format"""
125
+ url = (video or {}).get('url', '')
126
+ if not url:
127
+ return None
128
+
129
+ domain = self._extract_domain(url)
130
+ source_type = 'video'
131
+ title = str(video.get('title', '') or '').strip()
132
+ language = (video.get('language') or 'en').lower()
133
+ source_name = str(video.get('source', '') or '')
134
+ platform = str(video.get('platform', '') or '')
135
+ return {
136
+ 'url': url,
137
+ 'title': title,
138
+ 'content': '', # Videos don't have text content
139
+ 'domain': domain,
140
+ 'source_type': source_type,
141
+ 'language': language,
142
+ 'source_name': source_name,
143
+ 'platform': platform,
144
+ 'type': 'video'
145
+ }
146
+
147
+ def _extract_domain(self, url: str) -> str:
148
+ """Extract domain from URL"""
149
+ try:
150
+ parsed = urlparse(url)
151
+ domain = parsed.netloc.lower()
152
+ # Remove www. prefix
153
+ if domain.startswith('www.'):
154
+ domain = domain[4:]
155
+ return domain
156
+ except Exception:
157
+ return ''
158
+
159
+ def _classify_source_type(self, domain: str) -> str:
160
+ """Classify source type based on domain"""
161
+ for source_type, domains in self.source_types.items():
162
+ if domain in domains:
163
+ return source_type
164
+ return 'other'
165
+
166
+ def _get_source_credibility(self, domain: str) -> float:
167
+ """Deprecated: credibility scoring removed. Kept for compatibility."""
168
+ return 0.0
169
+
170
+ def _deduplicate_sources(self, sources: List[Dict]) -> List[Dict]:
171
+ """Remove duplicate sources based on URL and title similarity"""
172
+ seen_urls = set()
173
+ seen_titles = set()
174
+ unique_sources = []
175
+
176
+ for source in sources:
177
+ url = source.get('url', '')
178
+ title = source.get('title', '').lower().strip()
179
+
180
+ # Check for URL duplicates
181
+ if url in seen_urls:
182
+ continue
183
+
184
+ # Check for title similarity (fuzzy matching)
185
+ title_similar = any(self._titles_similar(title, seen_title) for seen_title in seen_titles)
186
+ if title_similar:
187
+ continue
188
+
189
+ seen_urls.add(url)
190
+ seen_titles.add(title)
191
+ unique_sources.append(source)
192
+
193
+ return unique_sources
194
+
195
+ def _titles_similar(self, title1: str, title2: str, threshold: float = 0.8) -> bool:
196
+ """Check if two titles are similar (simple word overlap)"""
197
+ if not title1 or not title2:
198
+ return False
199
+
200
+ words1 = set(title1.split())
201
+ words2 = set(title2.split())
202
+
203
+ if not words1 or not words2:
204
+ return False
205
+
206
+ intersection = words1.intersection(words2)
207
+ union = words1.union(words2)
208
+
209
+ similarity = len(intersection) / len(union) if union else 0
210
+ return similarity >= threshold
211
+
212
+ def _score_sources(self, sources: List[Dict]) -> List[Dict]:
213
+ """Score and rank sources by relevance and credibility"""
214
+ for source in sources:
215
+ # Calculate composite score
216
+ content_length = len(source.get('content', ''))
217
+ title_length = len(source.get('title', ''))
218
+
219
+ # Content quality score
220
+ content_score = min(content_length / 1000, 1.0) # Normalize to 0-1
221
+
222
+ # Title quality score
223
+ title_score = min(title_length / 100, 1.0) # Normalize to 0-1
224
+
225
+ # Composite score (weighted)
226
+ composite_score = (
227
+ content_score * 0.6 + # 60% content quality
228
+ title_score * 0.4 # 40% title quality
229
+ )
230
+
231
+ source['composite_score'] = composite_score
232
+
233
+ # Sort by composite score
234
+ sources.sort(key=lambda x: x.get('composite_score', 0), reverse=True)
235
+
236
+ return sources
237
+
238
+ def _create_reference_mapping(self, sources: List[Dict]) -> Dict[int, Dict]:
239
+ """Create reference mapping for citations"""
240
+ reference_mapping = {}
241
+
242
+ for i, source in enumerate(sources, 1):
243
+ reference_mapping[i] = {
244
+ 'url': source['url'],
245
+ 'title': source['title'],
246
+ 'domain': source['domain'],
247
+ 'source_type': source['source_type'],
248
+ 'language': source['language'],
249
+ 'type': source['type']
250
+ }
251
+
252
+ return reference_mapping
253
+
254
+ def _generate_source_summary(self, sources: List[Dict]) -> str:
255
+ """Generate summary of sources used"""
256
+ if not sources:
257
+ return "No sources available."
258
+
259
+ # Group by source type
260
+ type_counts = defaultdict(int)
261
+ language_counts = defaultdict(int)
262
+ # credibility removed
263
+
264
+ for source in sources:
265
+ source_type = source.get('source_type', 'other')
266
+ language = source.get('language', 'en')
267
+ type_counts[source_type] += 1
268
+ language_counts[language] += 1
269
+
270
+ # Generate summary
271
+ summary_parts = []
272
+ summary_parts.append(f"**Sources Used ({len(sources)} total):**")
273
+
274
+ # Source types
275
+ if type_counts:
276
+ type_summary = ", ".join([f"{count} {type_name}" for type_name, count in type_counts.items()])
277
+ summary_parts.append(f"• **Types**: {type_summary}")
278
+
279
+ # Languages
280
+ if language_counts:
281
+ lang_summary = ", ".join([f"{count} {lang}" for lang, count in language_counts.items()])
282
+ summary_parts.append(f"• **Languages**: {lang_summary}")
283
+
284
+ # Credibility
285
+ # credibility info removed
286
+
287
+ return "\n".join(summary_parts)
288
+
289
+ def _get_language_distribution(self, sources: List[Dict]) -> Dict[str, int]:
290
+ """Get distribution of sources by language"""
291
+ distribution = defaultdict(int)
292
+ for source in sources:
293
+ language = source.get('language', 'en')
294
+ distribution[language] += 1
295
+ return dict(distribution)
296
+
297
+ def _get_source_type_distribution(self, sources: List[Dict]) -> Dict[str, int]:
298
+ """Get distribution of sources by type"""
299
+ distribution = defaultdict(int)
300
+ for source in sources:
301
+ source_type = source.get('source_type', 'other')
302
+ distribution[source_type] += 1
303
+ return dict(distribution)
304
+
305
+ def create_comprehensive_references(self, sources: List[Dict], max_references: int = 15) -> str:
306
+ """Create comprehensive reference list for the response"""
307
+ if not sources:
308
+ return ""
309
+
310
+ # Take top sources
311
+ top_sources = sources[:max_references]
312
+
313
+ reference_parts = []
314
+ reference_parts.append("**📚 References:**")
315
+
316
+ for i, source in enumerate(top_sources, 1):
317
+ url = source.get('url', '')
318
+ title = source.get('title', '')
319
+ domain = source.get('domain', '')
320
+ source_type = source.get('source_type', 'other')
321
+ # credibility removed
322
+ language = source.get('language', 'en')
323
+ source_type_icon = source.get('type', 'other')
324
+
325
+ # Create type indicator
326
+ type_icons = {
327
+ 'academic': '🎓',
328
+ 'hospital': '🏥',
329
+ 'government': '🏛️',
330
+ 'commercial': '💼',
331
+ 'professional': '👨‍⚕️',
332
+ 'video': '📹',
333
+ 'other': '📄'
334
+ }
335
+ type_icon = type_icons.get(source_type, '📄')
336
+
337
+ # Create language indicator
338
+ lang_icons = {
339
+ 'en': '🇺🇸',
340
+ 'vi': '🇻🇳',
341
+ 'zh': '🇨🇳'
342
+ }
343
+ lang_icon = lang_icons.get(language, '🌐')
344
+
345
+ reference_line = f"{i}. {type_icon} {lang_icon} [{title}]({url}) - {domain}"
346
+ reference_parts.append(reference_line)
347
+
348
+ if len(sources) > max_references:
349
+ reference_parts.append(f"... and {len(sources) - max_references} more sources")
350
+
351
+ return "\n".join(reference_parts)
352
+
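A brief, hedged sketch of how `SourceAggregator` might be driven; the input dicts and URLs are illustrative, with field names matching what `_process_source` and `_process_video_source` read:

```python
from search.processors.sources import SourceAggregator

aggregator = SourceAggregator()

search_results = [
    {"url": "https://www.mayoclinic.org/headache", "title": "Headache basics",
     "content": "Overview of common headache causes and treatments.", "language": "en"},
]
video_results = [
    {"url": "https://www.youtube.com/watch?v=abc123", "title": "Headache relief tips",
     "language": "en", "platform": "youtube"},
]

aggregated = aggregator.aggregate_sources(search_results, video_results)
print(aggregated["total_sources"])    # 2
print(aggregated["source_types"])     # e.g. {'hospital': 1, 'video': 1}
print(aggregated["source_summary"])   # bullet summary of counts by type and language
print(aggregator.create_comprehensive_references(aggregated["sources"]))  # numbered, icon-prefixed list
```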
search/search.py ADDED
@@ -0,0 +1,362 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ import time
4
+ import hashlib
5
+ from .engines.duckduckgo import DuckDuckGoEngine
6
+ from .engines.video import VideoSearchEngine
7
+ from .coordinator import SearchCoordinator
8
+ # Reranker removed - using simple relevance scoring for cooking content
9
+ from models import summarizer
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Global instances
14
+ _duckduckgo_engine = None
15
+ _video_engine = None
16
+ _reranker = None
17
+ _search_coordinator = None
18
+
19
+ # Simple in-memory cache for search results
20
+ _search_cache = {}
21
+ _cache_ttl = 300 # 5 minutes TTL
22
+
23
+ def get_duckduckgo_engine() -> DuckDuckGoEngine:
24
+ """Get or create the global DuckDuckGo engine instance"""
25
+ global _duckduckgo_engine
26
+ if _duckduckgo_engine is None:
27
+ _duckduckgo_engine = DuckDuckGoEngine()
28
+ return _duckduckgo_engine
29
+
30
+ def get_video_engine() -> VideoSearchEngine:
31
+ """Get or create the global video engine instance"""
32
+ global _video_engine
33
+ if _video_engine is None:
34
+ _video_engine = VideoSearchEngine()
35
+ return _video_engine
36
+
37
+ def get_reranker():
38
+ """Simple cooking relevance scorer - no complex reranking needed"""
39
+ return None
40
+
41
+ def get_search_coordinator() -> SearchCoordinator:
42
+ """Get or create the global search coordinator instance"""
43
+ global _search_coordinator
44
+ if _search_coordinator is None:
45
+ _search_coordinator = SearchCoordinator()
46
+ return _search_coordinator
47
+
48
+ def _clean_search_query(query: str) -> str:
49
+ """Clean search query by removing bullet points and special characters"""
50
+ if not query:
51
+ return ""
52
+
53
+ import re
54
+ # Remove bullet points and special characters
55
+ cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query)
56
+ cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned)
57
+ cleaned = re.sub(r'\s+', ' ', cleaned)
58
+ cleaned = cleaned.strip()
59
+
60
+ # Remove common prefixes that might confuse search
61
+ prefixes_to_remove = [
62
+ r'^(en|vi|zh)\s*:\s*',
63
+ r'^(search|find|look for)\s+',
64
+ r'^(how to|what is|what are)\s+',
65
+ ]
66
+
67
+ for prefix in prefixes_to_remove:
68
+ cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)
69
+
70
+ return cleaned.strip()
71
+
72
+ def _boost_cooking_keywords(query: str) -> str:
73
+ """Add cooking context keywords to improve search relevance"""
74
+ if not query:
75
+ return ""
76
+
77
+ # Cooking keywords that boost relevance
78
+ cooking_boosters = [
79
+ 'recipe', 'cooking', 'culinary', 'technique', 'how to', 'bake', 'roast', 'sear', 'simmer',
80
+ 'ingredients', 'measurements', 'temperature', 'timing', 'substitution', 'variation', 'tips'
81
+ ]
82
+
83
+ query_lower = query.lower()
84
+
85
+ # If query doesn't contain cooking terms, add context
86
+ has_cooking = any(term in query_lower for term in cooking_boosters)
87
+
88
+ if not has_cooking:
89
+ # Add cooking context without being too verbose
90
+ if len(query.split()) <= 3:
91
+ return f"{query} cooking recipe technique"
92
+ else:
93
+ return f"{query} cooking tutorial"
94
+
95
+ return query
96
+
97
+ def _get_cache_key(query: str, num_results: int, target_language: str = None, include_videos: bool = True) -> str:
98
+ """Generate cache key for search results"""
99
+ cache_data = f"{query}_{num_results}_{target_language}_{include_videos}"
100
+ return hashlib.md5(cache_data.encode()).hexdigest()
101
+
102
+ def _get_cached_results(cache_key: str) -> Tuple[str, Dict[int, str], Dict]:
103
+ """Get cached search results if available and not expired"""
104
+ if cache_key not in _search_cache:
105
+ return None, None, None
106
+
107
+ cached_data = _search_cache[cache_key]
108
+ if time.time() - cached_data['timestamp'] > _cache_ttl:
109
+ # Cache expired
110
+ del _search_cache[cache_key]
111
+ return None, None, None
112
+
113
+ logger.info(f"Using cached search results for key: {cache_key[:8]}...")
114
+ return cached_data['search_context'], cached_data['url_mapping'], cached_data['source_aggregation']
115
+
116
+ def _cache_results(cache_key: str, search_context: str, url_mapping: Dict[int, str], source_aggregation: Dict):
117
+ """Cache search results"""
118
+ _search_cache[cache_key] = {
119
+ 'search_context': search_context,
120
+ 'url_mapping': url_mapping,
121
+ 'source_aggregation': source_aggregation,
122
+ 'timestamp': time.time()
123
+ }
124
+ logger.info(f"Cached search results for key: {cache_key[:8]}...")
125
+
126
+ class WebSearcher:
127
+ """Legacy wrapper for backward compatibility"""
128
+ def __init__(self):
129
+ self.coordinator = get_search_coordinator()
130
+ self.max_results = 10
131
+ self.timeout = 10
132
+
133
+ def search_google(self, query: str, num_results: int = 10) -> List[Dict]:
134
+ """Search using the new coordinator system"""
135
+ try:
136
+ cleaned_query = _clean_search_query(query)
137
+ return self.coordinator.quick_search(cleaned_query, num_results)
138
+ except Exception as e:
139
+ logger.error(f"Search failed: {e}")
140
+ return []
141
+
142
+ def search_duckduckgo(self, query: str, num_results: int = 10) -> List[Dict]:
143
+ """Search using DuckDuckGo engine"""
144
+ try:
145
+ cleaned_query = _clean_search_query(query)
146
+ return self.coordinator.quick_search(cleaned_query, num_results)
147
+ except Exception as e:
148
+ logger.error(f"DuckDuckGo search failed: {e}")
149
+ return []
150
+
151
+ def extract_content(self, url: str) -> str:
152
+ """Extract content using the new content extractor"""
153
+ try:
154
+ return self.coordinator.content_extractor.extract(url)
155
+ except Exception as e:
156
+ logger.error(f"Content extraction failed: {e}")
157
+ return ""
158
+
159
+ def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
160
+ """Search and extract content using the new system"""
161
+ try:
162
+ # Clean the query first
163
+ cleaned_query = _clean_search_query(query)
164
+ # Get search results
165
+ results = self.coordinator.quick_search(cleaned_query, num_results)
166
+
167
+ # Extract content for each result
168
+ enriched_results = []
169
+ for result in results:
170
+ content = self.extract_content(result['url'])
171
+ if content:
172
+ enriched_result = result.copy()
173
+ enriched_result['content'] = content
174
+ enriched_results.append(enriched_result)
175
+ return enriched_results
176
+ except Exception as e:
177
+ logger.error(f"Search and extract failed: {e}")
178
+ return []
179
+
180
+ # Main search function for backward compatibility
181
+ def search_web(query: str, num_results: int = 10) -> List[Dict]:
182
+ """Main search function using the new coordinator system"""
183
+ try:
184
+ # Clean the query first
185
+ cleaned_query = _clean_search_query(query)
186
+ coordinator = get_search_coordinator()
187
+ return coordinator.quick_search(cleaned_query, num_results)
188
+ except Exception as e:
189
+ logger.error(f"Web search failed: {e}")
190
+ return []
191
+
192
+ # Enhanced search function with content extraction
193
+ def search_web_with_content(query: str, num_results: int = 10) -> Tuple[str, Dict[int, str]]:
194
+ """Enhanced search with content extraction and summarization"""
195
+ try:
196
+ # Clean the query first
197
+ cleaned_query = _clean_search_query(query)
198
+ coordinator = get_search_coordinator()
199
+ return coordinator.search(cleaned_query, num_results)
200
+ except Exception as e:
201
+ logger.error(f"Enhanced web search failed: {e}")
202
+ return "", {}
203
+
204
+ # Cooking-focused search function
205
+ def search_cooking(query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
206
+ """Cooking-focused search with enhanced processing"""
207
+ try:
208
+ # Clean the query first
209
+ cleaned_query = _clean_search_query(query)
210
+ coordinator = get_search_coordinator()
211
+ return coordinator.cooking_focus_search(cleaned_query, num_results)
212
+ except Exception as e:
213
+ logger.error(f"Cooking search failed: {e}")
214
+ return "", {}
215
+
216
+ # Multilingual cooking search function
217
+ def search_multilingual_cooking(query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
218
+ """Comprehensive multilingual cooking search supporting English, Vietnamese, and Chinese"""
219
+ try:
220
+ # Clean the query first
221
+ cleaned_query = _clean_search_query(query)
222
+ coordinator = get_search_coordinator()
223
+ return coordinator.multilingual_cooking_search(cleaned_query, num_results, target_language)
224
+ except Exception as e:
225
+ logger.error(f"Multilingual cooking search failed: {e}")
226
+ return "", {}
227
+
228
+ # Video search function
229
+ def search_videos(query: str, num_results: int = 2, target_language: str = None) -> List[Dict]:
230
+ """Search for cooking videos across multiple platforms"""
231
+ try:
232
+ # Clean the query first
233
+ cleaned_query = _clean_search_query(query)
234
+ coordinator = get_search_coordinator()
235
+ return coordinator.video_search(cleaned_query, num_results, target_language)
236
+ except Exception as e:
237
+ logger.error(f"Video search failed: {e}")
238
+ return []
239
+
240
+ # Comprehensive search function with maximum information extraction
241
+ def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
242
+ """Comprehensive search with maximum information extraction and detailed references"""
243
+ logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
244
+
245
+ # Check cache first
246
+ cache_key = _get_cache_key(query, num_results, target_language, include_videos)
247
+ cached_context, cached_mapping, cached_aggregation = _get_cached_results(cache_key)
248
+ if cached_context is not None:
249
+ return cached_context, cached_mapping, cached_aggregation
250
+
251
+ # Clean and boost the query for better cooking relevance
252
+ cleaned_query = _clean_search_query(query)
253
+ boosted_query = _boost_cooking_keywords(cleaned_query)
254
+ logger.info(f"Query processing: '{query}' -> '{cleaned_query}' -> '{boosted_query}'")
255
+
256
+ # Get engines
257
+ duckduckgo_engine = get_duckduckgo_engine()
258
+ video_engine = get_video_engine()
259
+ reranker = get_reranker()
260
+
261
+ # Optimized search strategy: get just enough results for good filtering
262
+ # Calculate optimal initial count based on expected filtering ratio
263
+ expected_filter_ratio = 0.4 # Expect to keep ~40% after filtering
264
+ optimal_initial_count = max(num_results * 2, int(num_results / expected_filter_ratio))
265
+
266
+ # Search for text results with optimized count
267
+ text_results = duckduckgo_engine.search(boosted_query, optimal_initial_count)
268
+ logger.info(f"Found {len(text_results)} text results (requested {optimal_initial_count})")
269
+
270
+ # If no text results, try simple fallback search
271
+ if not text_results:
272
+ logger.warning("No text results found, trying simple fallback search")
273
+ try:
274
+ # Try with a very simple query
275
+ simple_query = " ".join(cleaned_query.split()[:3]) # First 3 words only
276
+ text_results = duckduckgo_engine.search(simple_query, num_results)
277
+ logger.info(f"Simple fallback found {len(text_results)} results")
278
+ except Exception as e:
279
+ logger.warning(f"Simple fallback search failed: {e}")
280
+
281
+ # Search for videos if requested (limit to avoid over-fetching)
282
+ video_results = []
283
+ if include_videos:
284
+ try:
285
+ # Map language codes for video search
286
+ lang_mapping = {
287
+ 'EN': 'en',
288
+ 'VI': 'vi',
289
+ 'ZH': 'zh',
290
+ 'en': 'en',
291
+ 'vi': 'vi',
292
+ 'zh': 'zh'
293
+ }
294
+ search_language = lang_mapping.get(target_language, 'en')
295
+ # Limit video results to avoid over-fetching
296
+ max_video_results = min(5, num_results // 3) # Max 5 or 1/3 of total
297
+ video_results = video_engine.search(boosted_query, num_results=max_video_results, language=search_language)
298
+ logger.info(f"Found {len(video_results)} video results")
299
+ except Exception as e:
300
+ logger.warning(f"Video search failed: {e}")
301
+
302
+ # Combine all results
303
+ all_results = text_results + video_results
304
+
305
+ # Simple cooking relevance filtering
306
+ if all_results:
307
+ # Filter by cooking relevance using simple keyword matching
308
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
309
+ relevant_results = []
310
+ for result in all_results:
311
+ title = result.get('title', '').lower()
312
+ content = result.get('content', '').lower()
313
+ if any(keyword in title or keyword in content for keyword in cooking_keywords):
314
+ relevant_results.append(result)
315
+
316
+ if relevant_results:
317
+ all_results = relevant_results
318
+ logger.info(f"Filtered to {len(all_results)} cooking-relevant results")
319
+
320
+ # Limit final results to requested count
321
+ all_results = all_results[:num_results]
322
+
323
+ # Final safety check - ensure we have at least some results
324
+ if not all_results and text_results:
325
+ logger.warning("No results after processing, using original text results as fallback")
326
+ all_results = text_results[:num_results]
327
+
328
+ # Create URL mapping
329
+ url_mapping = {}
330
+ for i, result in enumerate(all_results, 1):
331
+ url_mapping[i] = result.get('url', '')
332
+
333
+ # Create search context using summarizer (only for top results)
334
+ search_context = ""
335
+ if all_results:
336
+ summaries = []
337
+ # Only summarize top results to avoid over-processing
338
+ top_results = all_results[:min(10, len(all_results))]
339
+ for i, result in enumerate(top_results, 1):
340
+ content = result.get('content', '') or result.get('title', '')
341
+ if content:
342
+ # Use query-focused summarization
343
+ summary = summarizer.summarize_for_query(content, boosted_query, max_length=300)
344
+ if summary:
345
+ summaries.append(f"Document {i}: {summary}")
346
+
347
+ search_context = "\n\n".join(summaries)
348
+
349
+ # Create source aggregation
350
+ source_aggregation = {
351
+ 'total_sources': len(all_results),
352
+ 'text_sources': len(text_results),
353
+ 'video_sources': len(video_results),
354
+ 'sources': all_results
355
+ }
356
+
357
+ logger.info(f"Comprehensive search completed: {len(all_results)} total sources")
358
+
359
+ # Cache the results
360
+ _cache_results(cache_key, search_context, url_mapping, source_aggregation)
361
+
362
+ return search_context, url_mapping, source_aggregation
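Since this module is the public entry point, here is a hedged sketch of how it might be called. It performs live DuckDuckGo and video searches through the coordinator and engines wired up elsewhere in the repo, so it needs network access and results will vary:

```python
from search.search import search_comprehensive, search_videos

context, url_mapping, aggregation = search_comprehensive(
    "best way to sear a steak",
    num_results=8,
    target_language="en",
    include_videos=True,
)
print(context[:300])   # query-focused summaries, one "Document N: ..." entry per top source
print(url_mapping)     # {1: 'https://...', 2: 'https://...', ...}
print(aggregation["total_sources"], aggregation["video_sources"])

# Video-only lookup (language-filtered where the platform supports it)
videos = search_videos("how to fold dumplings", num_results=2, target_language="en")
```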
utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Utils package
2
+ from .translation import translate_query
3
+ from .vlm import process_medical_image
4
+ from .diagnosis import retrieve_diagnosis_from_symptoms
utils/migrate.py ADDED
@@ -0,0 +1,54 @@
1
+ # Run this script to move the FAISS index GridFS collections to a separate cluster.
2
+ from pymongo import MongoClient
3
+ from dotenv import load_dotenv
4
+ import os
5
+
6
+ def migrate_faiss_index():
7
+ """Migrate FAISS index from QA cluster to index cluster"""
8
+ # Load environment variables from .env
9
+ load_dotenv()
10
+ # Connection strings (update as needed)
11
+ mongo_uri = os.getenv("MONGO_URI") # QA cluster connection string
12
+ index_uri = os.getenv("INDEX_URI") # FAISS index cluster connection string
13
+
14
+ if not mongo_uri:
15
+ raise ValueError("MONGO_URI is missing!")
16
+ if not index_uri:
17
+ raise ValueError("INDEX_URI is missing!")
18
+
19
+ # Connect to the QA cluster (where FAISS data was accidentally stored)
20
+ qa_client = MongoClient(mongo_uri)
21
+ qa_db = qa_client["MedicalChatbotDB"]
22
+
23
+ # Connect to the FAISS index cluster
24
+ faiss_client = MongoClient(index_uri)
25
+ faiss_db = faiss_client["MedicalChatbotDB"] # Use the same database name if desired
26
+
27
+ # Define the GridFS collections to move.
28
+ # In GridFS, files are stored in two collections: "<bucket>.files" and "<bucket>.chunks".
29
+ source_files = qa_db["faiss_index_files.files"]
30
+ source_chunks = qa_db["faiss_index_files.chunks"]
31
+
32
+ dest_files = faiss_db["faiss_index_files.files"]
33
+ dest_chunks = faiss_db["faiss_index_files.chunks"]
34
+
35
+ print("Moving FAISS index GridFS files...")
36
+
37
+ # Copy documents from the source 'files' collection
38
+ for doc in source_files.find():
39
+ dest_files.insert_one(doc)
40
+
41
+ # Copy documents from the source 'chunks' collection
42
+ for doc in source_chunks.find():
43
+ dest_chunks.insert_one(doc)
44
+
45
+ print("✅ FAISS GridFS collections moved successfully.")
46
+
47
+ # Drop the old collections from the QA cluster to free up space:
48
+ qa_db.drop_collection("faiss_index_files.files")
49
+ qa_db.drop_collection("faiss_index_files.chunks")
50
+ print("Old FAISS GridFS collections dropped from the QA cluster.")
51
+
52
+ # Only run when called directly
53
+ if __name__ == "__main__":
54
+ migrate_faiss_index()
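A hypothetical way to drive the migration script above. The connection strings are placeholders (they would normally live in `.env`), and note that the function drops the source collections once the copy finishes:

```python
import os

# Placeholder connection strings; replace with real values or keep them in .env.
os.environ.setdefault("MONGO_URI", "mongodb+srv://user:pass@qa-cluster.example.net/")
os.environ.setdefault("INDEX_URI", "mongodb+srv://user:pass@index-cluster.example.net/")

from utils.migrate import migrate_faiss_index

# Copies faiss_index_files.files / faiss_index_files.chunks to the index cluster,
# then drops the originals from the QA cluster.
migrate_faiss_index()
```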
utils/symbipredict_2022.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils/translation.py ADDED
@@ -0,0 +1,141 @@
1
+ # translation.py
2
+ from transformers import pipeline
3
+ import logging
4
+ import re
5
+ from collections import Counter
6
+
7
+ logger = logging.getLogger("translation-agent")
8
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
9
+
10
+ # To use lazy model loader
11
+ vi_en = None
12
+ zh_en = None
13
+
14
+ def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
15
+ """Collapse excessive repeated n-grams and repeated phrases with improved logic."""
16
+ if not s:
17
+ return s
18
+
19
+ # Collapse repeated spaces/newlines
20
+ s = re.sub(r"\s+", " ", s).strip()
21
+
22
+ # More aggressive repetition detection
23
+ # Check for simple word repetition (like "a lot of people do not" repeated)
24
+ words = s.split()
25
+ if len(words) > 20: # Only check if text is long enough
26
+ # Look for repeated sequences of 3-8 words
27
+ for seq_len in range(8, 2, -1):
28
+ if len(words) < seq_len * 3: # Need at least 3 repetitions
29
+ continue
30
+
31
+ # Check each possible starting position
32
+ for start in range(len(words) - seq_len * 2):
33
+ sequence = words[start:start + seq_len]
34
+ # Count how many times this sequence repeats
35
+ repeat_count = 1
36
+ pos = start + seq_len
37
+ while pos + seq_len <= len(words):
38
+ if words[pos:pos + seq_len] == sequence:
39
+ repeat_count += 1
40
+ pos += seq_len
41
+ else:
42
+ break
43
+
44
+ # If we found 3+ repetitions, remove the excess
45
+ if repeat_count >= 3:
46
+ # Keep only the first occurrence
47
+ new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:]
48
+ s = " ".join(new_words)
49
+ words = s.split()
50
+ break
51
+ else:
52
+ continue
53
+ break # Break outer loop if we found and fixed a repetition
54
+
55
+ # Additional cleanup for remaining patterns
56
+ # Remove consecutive identical word
57
+ tokens = s.split()
58
+ out = []
59
+ last = None
60
+ for t in tokens:
61
+ if last is None or t.lower() != last.lower():
62
+ out.append(t)
63
+ last = t
64
+ s = " ".join(out)
65
+
66
+ # Limit consecutive duplicate n-grams
67
+ for n in range(n_max, n_min - 1, -1):
68
+ pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
69
+ s = pattern.sub(r"\1", s)
70
+
71
+ return s
72
+
73
+
74
+ def _normalize_and_cap(s: str, cap: int = 512) -> str:
75
+ if not s:
76
+ return s
77
+ s = s.strip()
78
+ if len(s) > cap:
79
+ s = s[:cap]
80
+ return s
81
+
82
+
83
+ def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
84
+ if not s:
85
+ return False
86
+ tokens = [t.lower() for t in s.split()]
87
+ if len(tokens) < 10:
88
+ return False
89
+ counts = Counter(tokens)
90
+ top = counts.most_common(1)[0][1]
91
+ return (top / max(1, len(tokens))) >= threshold
92
+
93
+
94
+ def translate_query(text: str, lang_code: str) -> str:
95
+ global vi_en, zh_en
96
+
97
+ if not text or not text.strip():
98
+ return text
99
+
100
+ try:
101
+ if lang_code == "vi":
102
+ if vi_en is None:
103
+ logger.info("[Translation] Loading Vietnamese-English model...")
104
+ vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
105
+
106
+ # Limit input length to prevent model issues
107
+ input_text = text[:1000] if len(text) > 1000 else text
108
+ raw = vi_en(input_text, max_length=512)[0]["translation_text"]
109
+ cleaned = _dedupe_repeats(raw)
110
+ norm = _normalize_and_cap(cleaned, cap=512)
111
+
112
+ if _is_too_repetitive(norm) or len(norm.strip()) < 10:
113
+ logger.warning("[En-Vi] Translation repetitive or too short; falling back to original text")
114
+ return text
115
+
116
+ logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {norm[:100]}...")
117
+ return norm
118
+
119
+ elif lang_code == "zh":
120
+ if zh_en is None:
121
+ logger.info("[Translation] Loading Chinese-English model...")
122
+ zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
123
+
124
+ # Limit input length to prevent model issues
125
+ input_text = text[:1000] if len(text) > 1000 else text
126
+ raw = zh_en(input_text, max_length=512)[0]["translation_text"]
127
+ cleaned = _dedupe_repeats(raw)
128
+ norm = _normalize_and_cap(cleaned, cap=512)
129
+
130
+ if _is_too_repetitive(norm) or len(norm.strip()) < 10:
131
+ logger.warning("[En-Zh] Translation repetitive or too short; falling back to original text")
132
+ return text
133
+
134
+ logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {norm[:100]}...")
135
+ return norm
136
+
137
+ except Exception as e:
138
+ logger.error(f"[Translation] Translation failed for {lang_code}: {e}")
139
+ return text # Fallback to original text
140
+
141
+ return text
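A quick sketch of the fallback behaviour in `translate_query`. The first Vietnamese or Chinese call downloads the corresponding translation model, so this is illustrative rather than a unit test, and the translated output is approximate:

```python
from utils.translation import translate_query, _dedupe_repeats

# Languages other than 'vi' and 'zh' are returned unchanged.
print(translate_query("hello there", "en"))            # -> "hello there"

# Vietnamese input is translated to English; repetitive or too-short output
# falls back to the original text.
print(translate_query("đau đầu và sốt cao", "vi"))     # e.g. "headache and high fever"

# The repetition guard collapses degenerate model output.
print(_dedupe_repeats("a lot of people do not " * 5))  # -> roughly "a lot of people do not"
```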
utils/vlm.py ADDED
@@ -0,0 +1,54 @@
1
+ import os, logging, traceback, json, base64
2
+ from io import BytesIO
3
+ from PIL import Image
4
+ from .translation import translate_query
5
+ from gradio_client import Client, handle_file
6
+ import tempfile
7
+
8
+ logger = logging.getLogger("vlm-agent")
9
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True)
10
+
11
+ # ✅ Load Gradio client once
12
+ gr_client = None
13
+ def load_gradio_client():
14
+ global gr_client
15
+ if gr_client is None:
16
+ logger.info("[VLM] ⏳ Connecting to MedGEMMA Gradio Space...")
17
+ gr_client = Client("warshanks/medgemma-4b-it")
18
+ logger.info("[VLM] Gradio MedGEMMA client ready.")
19
+ return gr_client
20
+
21
+ def process_medical_image(base64_image: str, prompt: str = None, lang: str = "EN") -> str:
22
+ if not prompt:
23
+ prompt = "Describe and investigate any clinical findings from this medical image."
24
+ elif lang.upper() in {"VI", "ZH"}:
25
+ prompt = translate_query(prompt, lang.lower())
26
+
27
+ try:
28
+ # 1️⃣ Decode base64 image to temp file
29
+ image_data = base64.b64decode(base64_image)
30
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
31
+ tmp.write(image_data)
32
+ tmp.flush()
33
+ image_path = tmp.name
34
+
35
+ # 2️⃣ Send to Gradio MedGEMMA
36
+ client = load_gradio_client()
37
+ logger.info(f"[VLM] Sending prompt: {prompt}")
38
+ result = client.predict(
39
+ message={"text": prompt, "files": [handle_file(image_path)]},
40
+ param_2 = "You analyze medical images and report abnormalities, diseases with clear diagnostic insight.",
41
+ param_3=2048,
42
+ api_name="/chat"
43
+ )
44
+ if isinstance(result, str):
45
+ logger.info(f"[VLM] ✅ Response: {result}")
46
+ return result.strip()
47
+ else:
48
+ logger.warning(f"[VLM] ⚠️ Unexpected result type: {type(result)} — {result}")
49
+ return str(result)
50
+
51
+ except Exception as e:
52
+ logger.error(f"[VLM] ❌ Exception: {e}")
53
+ logger.error(f"[VLM] 🔍 Traceback:\n{traceback.format_exc()}")
54
+ return f"[VLM] ⚠️ Failed to process image: {e}"