LiamKhoaLe committed on
Commit 21446aa · 0 Parent(s)

Init commit
.DS_Store ADDED
Binary file (6.15 kB).
 
.dockerignore ADDED
@@ -0,0 +1,4 @@
+ api/legacy.py
+ *.md
+ .env
+ *yml
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ secrets.toml
.huggingface.yml ADDED
@@ -0,0 +1,4 @@
+ sdk: docker
+ app_file: app.py
+ port: 7860
+ hardware: cpu-basic
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM python:3.11
+
+ # Create and use a non-root user (optional)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy all project files to the container
+ COPY . .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Verify CSV file is present and accessible
+ RUN ls -la /app/utils/symbipredict_2022.csv || echo "CSV file not found"
+
+ # Test CSV loading in Docker environment
+ RUN python /app/test_docker_csv.py
+
+ # Clean up test file
+ RUN rm /app/test_docker_csv.py
+
+ # Set Hugging Face cache directory to persist model downloads
+ ENV HF_HOME="/home/user/.cache/huggingface"
+ ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transformers"
+ ENV MEDGEMMA_HOME="/home/user/.cache/huggingface/sentence-transformers"
+
+ # Create cache directories and ensure permissions
+ RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
+     chown -R user:user /app/model_cache /home/user/.cache/huggingface
+
+ # Pre-load model in a separate script
+ RUN python /app/models/download_model.py && python /app/models/warmup.py
+
+ # Ensure ownership and permissions remain intact
+ RUN chown -R user:user /app/model_cache
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run the application using main.py as entry point
+ CMD ["python", "main.py"]
README.md ADDED
@@ -0,0 +1,116 @@
+ ---
+ title: Cooking Tutor API
+ emoji: 👨‍🍳
+ colorFrom: orange
+ colorTo: red
+ sdk: docker
+ sdk_version: latest
+ pinned: false
+ license: apache-2.0
+ short_description: Cooking Tutor with WebSearch, Memory, Multilingual
+ ---
+
+ # Cooking Tutor Backend
+
+ ## At-a-glance
+ Production-grade cooking assistant with web search integration, conversation memory, multilingual support, and comprehensive recipe guidance.
+
+ ## Key Features
+
+ ### 🔍 Web Search Integration
+ - Curated cooking sources (AllRecipes, Food Network, Epicurious, etc.)
+ - Content extraction and summarization
+ - Citation mapping with clickable URLs
+ - Cooking relevance filtering
+
+ ### 🧠 Memory & Retrieval
+ - Conversation memory with FAISS indexing
+ - Semantic chunking and summarization
+ - Context builder for conversation continuity
+ - Up to 20 recent summaries per user
+
+ ### 🌍 Multilingual Support
+ - Vietnamese and Chinese translation
+ - Language detection and query enhancement
+ - Fallback handling for translation failures
+
+ ### 🍳 Cooking Focus
+ - Specialized cooking keyword filtering
+ - Recipe and technique guidance
+ - Ingredient substitution suggestions
+ - Cooking time and temperature guidance
+
+ ## Usage
+
+ ### Running the Application
+ ```bash
+ # Using the main entry point
+ python main.py
+
+ # Or run the FastAPI app directly
+ python api/app.py
+ ```
+
+ ### Environment Variables
+ - `FlashAPI` - Gemini API key (required)
+ - `NVIDIA_URI` - NVIDIA API key for keyword generation, summarization, and reranking (optional)
+ - `NVIDIA_RERANK_ENDPOINT` - Optional reranker endpoint override
+
+ ## API Endpoints
+
+ ### POST `/chat`
+ Main chat endpoint with cooking guidance.
+
+ **Request Body:**
+ ```json
+ {
+   "query": "How to make perfect pasta?",
+   "lang": "EN",
+   "search": true,
+   "user_id": "unique_user_id",
+   "servings": 4,
+   "dietary": ["vegetarian"],
+   "skill_level": "beginner",
+   "structured": true
+ }
+ ```
+
+ **Response:**
+ ```json
+ {
+   "response": "Cooking guidance with citations <URL>",
+   "response_time": "2.34s"
+ }
+ ```
+
+ ## Search Mode Features
+
+ When `search: true`:
+ 1. Search curated cooking sources
+ 2. Extract and summarize relevant content
+ 3. Filter by cooking relevance
+ 4. Provide citations with clickable URLs
+
+ ## Memory Features
+
+ - **Conversation Continuity**: Maintains context across sessions
+ - **Semantic Chunking**: Groups related cooking topics
+ - **Usage Tracking**: Prioritizes frequently used information
+ - **Time Decay**: Recent conversations get higher priority
+
+ ## Folders Overview
+ - `api/` - FastAPI app, routes, chatbot orchestration
+ - `models/` - Summarizer and processing models
+ - `memory/` - Memory manager and FAISS interfaces
+ - `search/` - Web search engines and processors
+ - `utils/` - Translation and utility functions
+
+ ## Dependencies
+
+ See `requirements.txt` for the complete list. Key components:
+ - `google-genai` - Gemini API integration
+ - `faiss-cpu` - Vector similarity search
+ - `sentence-transformers` - Text embeddings
+ - `transformers` - Translation models
+ - `requests` - Web search functionality
+ - `beautifulsoup4` - HTML content extraction
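For reference, a minimal client call against the documented `/chat` endpoint might look like the sketch below; the base URL and user ID are placeholders, and the payload mirrors the request schema shown in the README above.

```python
# Hypothetical client sketch for POST /chat; host/port and user_id are placeholders.
import requests

payload = {
    "query": "How to make perfect pasta?",
    "lang": "EN",
    "search": True,
    "user_id": "demo_user",
    "servings": 4,
    "dietary": ["vegetarian"],
    "skill_level": "beginner",
    "structured": True,
}

resp = requests.post("http://localhost:7860/chat", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["response"])
```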
api/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # API Module Structure - Cooking Tutor
2
+
3
+ ## 📁 **Module Overview**
4
+
5
+ ### **config.py** - Configuration Management
6
+ - Environment variables validation
7
+ - Logging configuration
8
+ - System resource monitoring
9
+ - Memory optimization settings
10
+ - CORS configuration
11
+
12
+ ### **retrieval.py** - Web Search Integration
13
+ - Cooking information retrieval via web search
14
+ - Recipe suggestion system
15
+ - Smart content filtering and relevance scoring
16
+ - Web search result processing
17
+
18
+ ### **chatbot.py** - Core Chatbot Logic
19
+ - CookingTutorChatbot class
20
+ - Gemini API client
21
+ - Web search integration
22
+ - Citation processing
23
+ - Memory management integration
24
+
25
+ ### **routes.py** - API Endpoints
26
+ - `/chat` - Main chat endpoint
27
+ - `/health` - Health check
28
+ - `/` - Root endpoint with landing page
29
+ - Request/response handling
30
+
31
+ ### **app.py** - Main Application
32
+ - FastAPI app initialization
33
+ - Middleware configuration
34
+ - Route registration
35
+ - Server startup
36
+
37
+ ## 🔄 **Data Flow**
38
+
39
+ ```
40
+ Request → routes.py → chatbot.py → search.py (web search)
41
+
42
+ memory.py (context) + utils/ (translation)
43
+
44
+ models/ (summarization processing)
45
+
46
+ Response with citations
47
+ ```
48
+
49
+ ## 🚀 **Benefits of Modular Structure**
50
+
51
+ 1. **Separation of Concerns**: Each module has a single responsibility
52
+ 2. **Easier Testing**: Individual modules can be tested in isolation
53
+ 3. **Better Maintainability**: Changes to one module don't affect others
54
+ 4. **Improved Readability**: Smaller files are easier to understand
55
+ 5. **Reusability**: Modules can be imported and used elsewhere
56
+ 6. **Scalability**: Easy to add new features without affecting existing code
57
+
58
+ ## 📊 **File Sizes**
59
+
60
+ | File | Lines | Purpose |
61
+ |------|-------|---------|
62
+ | **app.py** | 50 | Main app initialization |
63
+ | **config.py** | 68 | Configuration |
64
+ | **retrieval.py** | 156 | Web search integration |
65
+ | **chatbot.py** | 203 | Chatbot logic |
66
+ | **routes.py** | 435 | API endpoints |
67
+
68
+ ## 🔧 **Usage**
69
+
70
+ The modular structure maintains a clean API interface:
71
+
72
+ ```python
73
+ # All imports work the same way
74
+ from api.app import app
75
+ from api.chatbot import CookingTutorChatbot
76
+ from api.retrieval import retrieval_engine
77
+ ```
78
+
79
+ ## 🛠 **Development Benefits**
80
+
81
+ - **Easier Debugging**: Issues can be isolated to specific modules
82
+ - **Parallel Development**: Multiple developers can work on different modules
83
+ - **Code Reviews**: Smaller files are easier to review
84
+ - **Documentation**: Each module can have focused documentation
85
+ - **Testing**: Unit tests can be written for each module independently
api/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # API package
+ # Main API endpoints and routes
api/app.py ADDED
@@ -0,0 +1,49 @@
1
+ # api/app.py
2
+ import uvicorn
3
+ from fastapi import FastAPI
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from .config import setup_logging, check_system_resources, optimize_memory, CORS_ORIGINS, validate_environment
6
+ from .routes import router
7
+
8
+ # ✅ Validate environment
9
+ validate_environment()
10
+
11
+ # ✅ Setup logging
12
+ logger = setup_logging()
13
+ logger.info("🍳 Starting Cooking Tutor API...")
14
+
15
+ # ✅ Monitor system resources
16
+ check_system_resources(logger)
17
+
18
+ # ✅ Optimize memory usage
19
+ optimize_memory()
20
+
21
+ # ✅ Initialize FastAPI app
22
+ app = FastAPI(
23
+ title="Cooking Tutor API",
24
+ description="AI-powered cooking lesson and recipe tutoring with web search",
25
+ version="1.0.0"
26
+ )
27
+
28
+ # ✅ Add CORS middleware
29
+ app.add_middleware(
30
+ CORSMiddleware,
31
+ allow_origins=CORS_ORIGINS,
32
+ allow_credentials=True,
33
+ allow_methods=["*"],
34
+ allow_headers=["*"],
35
+ )
36
+
37
+ # No database initialization required for cooking tutor (web-search only)
38
+
39
+ # ✅ Include routes
40
+ app.include_router(router)
41
+
42
+ # ✅ Run Uvicorn
43
+ if __name__ == "__main__":
44
+ logger.info("[System] ✅ Starting FastAPI Server...")
45
+ try:
46
+ uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
47
+ except Exception as e:
48
+ logger.error(f"❌ Server Startup Failed: {e}")
49
+ exit(1)
api/chatbot.py ADDED
@@ -0,0 +1,202 @@
1
+ # api/chatbot.py
2
+ import re
3
+ import logging
4
+ from typing import Dict
5
+ from google import genai
6
+ from .config import gemini_flash_api_key
7
+ from memory import MemoryManager
8
+ from utils import translate_query
9
+ from search import search_comprehensive
10
+ # Safety guard removed - cooking tutor doesn't need medical safety checks
11
+
12
+ logger = logging.getLogger("cooking-tutor")
13
+
14
+ class GeminiClient:
15
+ """Gemini API client for generating responses"""
16
+
17
+ def __init__(self):
18
+ self.client = genai.Client(api_key=gemini_flash_api_key)
19
+
20
+ def generate_content(self, prompt: str, model: str = "gemini-2.5-flash", temperature: float = 0.7) -> str:
21
+ """Generate content using Gemini API"""
22
+ try:
23
+ response = self.client.models.generate_content(model=model, contents=prompt)
24
+ return response.text
25
+ except Exception as e:
26
+ logger.error(f"[LLM] ❌ Error calling Gemini API: {e}")
27
+ return "Error generating response from Gemini."
28
+
29
+ class CookingTutorChatbot:
30
+ """Cooking tutor chatbot that uses only web search + memory."""
31
+
32
+ def __init__(self, model_name: str):
33
+ self.model_name = model_name
34
+ self.gemini_client = GeminiClient()
35
+ self.memory = MemoryManager()
36
+
37
+ def chat(
38
+ self,
39
+ user_id: str,
40
+ user_query: str,
41
+ lang: str = "EN",
42
+ search_mode: bool = True,
43
+ video_mode: bool = False,
44
+ servings: int = None,
45
+ dietary: list = None,
46
+ allergens: list = None,
47
+ equipment: list = None,
48
+ time_limit_minutes: int = None,
49
+ skill_level: str = None,
50
+ cuisine: str = None,
51
+ structured: bool = False,
52
+ ) -> str:
53
+ # Translate to English-centric search if needed
54
+ if lang.upper() in {"VI", "ZH"}:
55
+ user_query = translate_query(user_query, lang.lower())
56
+
57
+ # Basic cooking relevance check
58
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
59
+ query_lower = user_query.lower()
60
+ if not any(keyword in query_lower for keyword in cooking_keywords):
61
+ logger.warning(f"[SAFETY] Non-cooking query detected: {user_query}")
62
+ return "⚠️ I'm a cooking tutor! Please ask me about recipes, cooking techniques, ingredients, or anything food-related."
63
+
64
+ # Conversation memory (recent turns)
65
+ contextual_chunks = self.memory.get_contextual_chunks(user_id, user_query, lang)
66
+
67
+ # Web search context
68
+ search_context = ""
69
+ url_mapping = {}
70
+ source_aggregation = {}
71
+ video_results = []
72
+
73
+ if search_mode:
74
+ try:
75
+ search_context, url_mapping, source_aggregation = search_comprehensive(
76
+ f"cooking technique tutorial: {user_query}",
77
+ num_results=12,
78
+ target_language=lang,
79
+ include_videos=bool(video_mode)
80
+ )
81
+ if video_mode and source_aggregation:
82
+ video_results = source_aggregation.get('sources', []) or []
83
+ except Exception as e:
84
+ logger.error(f"[SEARCH] Failed: {e}")
85
+
86
+ # Build prompt
87
+ parts = [
88
+ "You are a professional cooking tutor and recipe coach.",
89
+ "Provide step-by-step, practical instructions with exact measurements, temperatures, and timings.",
90
+ "Offer substitutions, variations, pantry-friendly swaps, and troubleshooting tips.",
91
+ "Adapt guidance to different skill levels (beginner/intermediate/advanced).",
92
+ "Use Markdown with headings, numbered steps, bullet lists, and short paragraphs.",
93
+ "Always include a concise Ingredients list when relevant.",
94
+ "Cite sources inline using <#ID> tags already present in the search context when applicable.",
95
+ ]
96
+
97
+ # Constraints block
98
+ constraints = []
99
+ if servings:
100
+ constraints.append(f"Servings: {servings}")
101
+ if dietary:
102
+ constraints.append(f"Dietary preferences: {', '.join(dietary)}")
103
+ if allergens:
104
+ constraints.append(f"Avoid allergens: {', '.join(allergens)}")
105
+ if equipment:
106
+ constraints.append(f"Available equipment: {', '.join(equipment)}")
107
+ if time_limit_minutes:
108
+ constraints.append(f"Time limit: {time_limit_minutes} minutes")
109
+ if skill_level:
110
+ constraints.append(f"Skill level: {skill_level}")
111
+ if cuisine:
112
+ constraints.append(f"Cuisine: {cuisine}")
113
+
114
+ if constraints:
115
+ parts.append("Constraints to respect:\n- " + "\n- ".join(constraints))
116
+
117
+ if contextual_chunks:
118
+ parts.append("Relevant context from previous messages:\n" + contextual_chunks)
119
+ if search_context:
120
+ parts.append("Cooking knowledge from the web (with citations):\n" + search_context)
121
+
122
+ parts.append(f"User's cooking question: {user_query}")
123
+ parts.append(f"Language to generate answer: {lang}")
124
+
125
+ if structured:
126
+ parts.append(
127
+ "Return a Markdown response with these sections if relevant:"
128
+ "\n1. Title"
129
+ "\n2. Summary (2-3 sentences)"
130
+ "\n3. Ingredients (quantities in metric and US units)"
131
+ "\n4. Equipment"
132
+ "\n5. Step-by-step Instructions (numbered)"
133
+ "\n6. Timing & Temperatures"
134
+ "\n7. Variations & Substitutions"
135
+ "\n8. Troubleshooting & Doneness Cues"
136
+ "\n9. Storage & Reheating"
137
+ "\n10. Sources"
138
+ )
139
+
140
+ prompt = "\n\n".join(parts)
141
+ response = self.gemini_client.generate_content(prompt, model=self.model_name, temperature=0.6)
142
+
143
+ # Process citations
144
+ if url_mapping:
145
+ response = self._process_citations(response, url_mapping)
146
+
147
+ # Basic cooking relevance check for response
148
+ if response and len(response) > 50:
149
+ response_lower = response.lower()
150
+ if not any(keyword in response_lower for keyword in cooking_keywords):
151
+ logger.warning(f"[SAFETY] Non-cooking response detected, redirecting to cooking topic")
152
+ response = "⚠️ Let's stick to cooking-related topics. Try asking about recipes, techniques, or ingredients!"
153
+
154
+ if user_id:
155
+ self.memory.add_exchange(user_id, user_query, response, lang=lang)
156
+
157
+ if video_mode and video_results:
158
+ return {
159
+ 'text': response.strip(),
160
+ 'videos': video_results
161
+ }
162
+ return response.strip()
163
+
164
+ def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
165
+ """Replace citation tags with actual URLs, handling both single and multiple references"""
166
+
167
+ # Pattern to match both single citations <#1> and multiple citations <#1, #2, #5, #7, #9>
168
+ citation_pattern = r'<#([^>]+)>'
169
+
170
+ def replace_citation(match):
171
+ citation_content = match.group(1)
172
+ # Split by comma and clean up each citation ID
173
+ citation_ids = [id_str.strip() for id_str in citation_content.split(',')]
174
+
175
+ urls = []
176
+ for citation_id in citation_ids:
177
+ try:
178
+ doc_id = int(citation_id)
179
+ if doc_id in url_mapping:
180
+ url = url_mapping[doc_id]
181
+ urls.append(f'<{url}>')
182
+ logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
183
+ else:
184
+ logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
185
+ urls.append(f'<#{doc_id}>') # Keep original if URL not found
186
+ except ValueError:
187
+ logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
188
+ urls.append(f'<#{citation_id}>') # Keep original if invalid
189
+
190
+ # Join multiple URLs with spaces
191
+ return ' '.join(urls)
192
+
193
+ # Replace citations with URLs
194
+ processed_response = re.sub(citation_pattern, replace_citation, response)
195
+
196
+ # Count total citations processed
197
+ citations_found = re.findall(citation_pattern, response)
198
+ total_citations = sum(len([id_str.strip() for id_str in citation_content.split(',')])
199
+ for citation_content in citations_found)
200
+
201
+ logger.info(f"[CITATION] Processed {total_citations} citations from {len(citations_found)} citation groups, {len(url_mapping)} URL mappings available")
202
+ return processed_response
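To illustrate how `_process_citations` rewrites citation tags, here is a small standalone sketch of the same regex logic applied to a made-up response and URL mapping; it adds one small tweak (stripping a leading `#` from secondary IDs in a group) that is not in the method above.

```python
# Standalone sketch of the citation-rewriting logic; sample text and URLs are illustrative.
import re

url_mapping = {1: "https://example.com/pasta-guide", 2: "https://example.com/salting-water"}
response = "Salt the water generously <#1, #2> and cook until al dente <#1>."

def replace_citation(match: re.Match) -> str:
    ids = [s.strip().lstrip("#") for s in match.group(1).split(",")]  # "1, #2" -> ["1", "2"]
    urls = []
    for raw in ids:
        try:
            doc_id = int(raw)
            urls.append(f"<{url_mapping[doc_id]}>" if doc_id in url_mapping else f"<#{doc_id}>")
        except ValueError:
            urls.append(f"<#{raw}>")  # keep the original tag if the ID is not numeric
    return " ".join(urls)

print(re.sub(r"<#([^>]+)>", replace_citation, response))
# -> "Salt the water generously <https://example.com/pasta-guide> <https://example.com/salting-water> ..."
```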
api/config.py ADDED
@@ -0,0 +1,67 @@
1
+ # api/config.py
2
+ import os
3
+ import logging
4
+ import psutil
5
+ from typing import List
6
+
7
+ # ✅ Environment Variables
8
+ gemini_flash_api_key = os.getenv("FlashAPI")
9
+
10
+ # Validate environment endpoint (only when actually running the app)
11
+ def validate_environment():
12
+ if not gemini_flash_api_key:
13
+ raise ValueError("❌ Missing FlashAPI key for Gemini. Set env var FlashAPI.")
14
+
15
+ # ✅ Logging Configuration
16
+ def setup_logging():
17
+ """Configure logging for the application"""
18
+ # Silence noisy loggers
19
+ for name in [
20
+ "uvicorn.error", "uvicorn.access",
21
+ "fastapi", "starlette",
22
+ "pymongo", "gridfs",
23
+ "sentence_transformers", "faiss",
24
+ "google", "google.auth",
25
+ ]:
26
+ logging.getLogger(name).setLevel(logging.WARNING)
27
+
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format="%(asctime)s — %(name)s — %(levelname)s — %(message)s",
31
+ force=True
32
+ )
33
+
34
+ logger = logging.getLogger("cooking-tutor")
35
+ logger.setLevel(logging.DEBUG)
36
+ return logger
37
+
38
+ # ✅ System Resource Monitoring
39
+ def check_system_resources(logger):
40
+ """Monitor system resources and log warnings"""
41
+ memory = psutil.virtual_memory()
42
+ cpu = psutil.cpu_percent(interval=1)
43
+ disk = psutil.disk_usage("/")
44
+
45
+ logger.info(f"[System] 🔍 System Resources - RAM: {memory.percent}%, CPU: {cpu}%, Disk: {disk.percent}%")
46
+
47
+ if memory.percent > 85:
48
+ logger.warning("⚠️ High RAM usage detected!")
49
+ if cpu > 90:
50
+ logger.warning("⚠️ High CPU usage detected!")
51
+ if disk.percent > 90:
52
+ logger.warning("⚠️ High Disk usage detected!")
53
+
54
+ # ✅ Memory Optimization
55
+ def optimize_memory():
56
+ """Set environment variables for memory optimization"""
57
+ os.environ["OMP_NUM_THREADS"] = "1"
58
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
59
+
60
+ # ✅ CORS Configuration
61
+ CORS_ORIGINS = [
62
+ "http://localhost:5173", # Vite dev server
63
+ "http://localhost:3000", # Another vercel local dev
64
+ "https://cooking-tutor.vercel.app", # ✅ Vercel frontend production URL
65
+ ]
66
+
67
+ # No embedding/RAG models used in cooking tutor
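A minimal local sanity check of these configuration helpers might look like the sketch below, assuming `FlashAPI` is exported in the shell before the module is imported (the key is read at import time).

```python
# Hypothetical local check of api/config.py helpers; run from the project root
# with the FlashAPI environment variable already set in the shell.
from api.config import setup_logging, validate_environment, check_system_resources, optimize_memory

validate_environment()          # raises if FlashAPI is missing
logger = setup_logging()        # configures app-wide logging and silences noisy libraries
optimize_memory()               # caps thread usage for small containers
check_system_resources(logger)  # logs RAM/CPU/disk and warns on pressure
```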
api/retrieval.py ADDED
@@ -0,0 +1,155 @@
1
+ # api/retrieval.py
2
+ import os
3
+ import re
4
+ import time
5
+ import requests
6
+ import numpy as np
7
+ import logging
8
+ from typing import List, Dict
9
+ # Database removed - cooking tutor uses web search only
10
+ from models import summarizer
11
+
12
+ logger = logging.getLogger("retrieval-bot")
13
+
14
+ class RetrievalEngine:
15
+ def __init__(self):
16
+ # Database removed - cooking tutor uses web search only
17
+ self._reranker = None
18
+
19
+ def _get_reranker(self):
20
+ """Initialize the NVIDIA reranker on first use."""
21
+ if self._reranker is None:
22
+ self._reranker = _NvidiaReranker()
23
+ return self._reranker
24
+
25
+ @staticmethod
26
+ def _is_cooking_guide_text(text: str) -> bool:
27
+ """Heuristic to detect cooking guide content."""
28
+ if not text:
29
+ return False
30
+ keywords = [
31
+ # common cooking guide indicators
32
+ r"\bguideline(s)?\b", r"\bcooking practice\b", r"\brecommend(ation|ed|s)?\b",
33
+ r"\bshould\b", r"\bmust\b", r"\bstrongly (recommend|suggest)\b",
34
+ r"\brecipe\b", r"\btechnique\b", r"\bmethod\b", r"\binstruction\b",
35
+ r"\btemperature\b", r"\btiming\b", r"\bmeasurement\b"
36
+ ]
37
+ text_lc = text.lower()
38
+ return any(re.search(p, text_lc, flags=re.IGNORECASE) for p in keywords)
39
+
40
+ @staticmethod
41
+ def _extract_cooking_guide_sentences(text: str) -> str:
42
+ """Extract likely cooking guide sentences to reduce conversational/noisy content before summarization."""
43
+ if not text:
44
+ return ""
45
+ sentences = re.split(r"(?<=[.!?])\s+", text)
46
+ keep_patterns = [
47
+ r"\b(recommend|should|must|preferred|first-choice|consider)\b",
48
+ r"\b(temperature|timing|measurement|portion|serving)\b",
49
+ r"\b(ingredient|seasoning|spice|herb|sauce|marinade)\b",
50
+ r"\b(prepare|cook|bake|roast|grill|fry|boil|steam)\b"
51
+ ]
52
+ kept = []
53
+ for s in sentences:
54
+ s_norm = s.strip()
55
+ if not s_norm:
56
+ continue
57
+ if any(re.search(p, s_norm, flags=re.IGNORECASE) for p in keep_patterns):
58
+ kept.append(s_norm)
59
+ # Fallback: if filtering too aggressive, keep truncated original
60
+ if not kept:
61
+ return text[:1200]
62
+ return " ".join(kept)[:2000]
63
+
64
+ def retrieve_cooking_info(self, query: str, k: int = 5, min_sim: float = 0.8) -> list:
65
+ """
66
+ Retrieve cooking information - placeholder for web search integration
67
+ """
68
+ # This method is kept for compatibility but cooking tutor uses web search
69
+ logger.info(f"[Retrieval] Cooking info retrieval requested for: {query}")
70
+ return [""]
71
+
72
+ def retrieve_recipe_suggestions(self, ingredient_text: str, top_k: int = 5, min_sim: float = 0.5) -> list:
73
+ """
74
+ Retrieve recipe suggestions from ingredients - placeholder for web search integration
75
+ """
76
+ # This method is kept for compatibility but cooking tutor uses web search
77
+ logger.info(f"[Retrieval] Recipe suggestions requested for ingredients: {ingredient_text}")
78
+ return [""]
79
+
80
+ # Global retrieval engine instance
81
+ retrieval_engine = RetrievalEngine()
82
+
83
+
84
+ class _NvidiaReranker:
85
+ """Simple client for NVIDIA NIM reranking: nvidia/rerank-qa-mistral-4b"""
86
+ def __init__(self):
87
+ self.api_key = os.getenv("NVIDIA_URI")
88
+ # Use provider doc model identifier
89
+ self.model = os.getenv("NVIDIA_RERANK_MODEL", "nv-rerank-qa-mistral-4b:1")
90
+ # NIM rerank endpoint (subject to environment); keep configurable
91
+ self.base_url = os.getenv("NVIDIA_RERANK_ENDPOINT", "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking")
92
+ self.timeout_s = 30
93
+
94
+ def rerank(self, query: str, documents: List[str]) -> List[Dict]:
95
+ if not self.api_key:
96
+ raise ValueError("NVIDIA_URI not set for reranker")
97
+ if not documents:
98
+ return []
99
+ headers = {
100
+ "Authorization": f"Bearer {self.api_key}",
101
+ "Content-Type": "application/json",
102
+ "Accept": "application/json",
103
+ }
104
+ # Truncate and limit candidates to avoid 4xx
105
+ docs = documents[:10]
106
+ docs = [d[:2000] for d in docs if isinstance(d, str)]
107
+ # Two payload shapes based on provider doc
108
+ payloads = [
109
+ {
110
+ "model": self.model,
111
+ "query": {"text": query},
112
+ "passages": [{"text": d} for d in docs],
113
+ },
114
+ {
115
+ "model": self.model,
116
+ "query": query,
117
+ "documents": [{"text": d} for d in docs],
118
+ },
119
+ ]
120
+ try:
121
+ data = None
122
+ for p in payloads:
123
+ resp = requests.post(self.base_url, headers=headers, json=p, timeout=self.timeout_s)
124
+ if resp.status_code >= 400:
125
+ # try next shape
126
+ continue
127
+ data = resp.json()
128
+ break
129
+ if data is None:
130
+ # last attempt for diagnostics
131
+ resp.raise_for_status()
132
+ # Expecting a list with scores and indices or texts
133
+ results = []
134
+ entries = data.get("results") or data.get("data") or []
135
+ if isinstance(entries, list) and entries:
136
+ for entry in entries:
137
+ # Common patterns: {index, score} or {text, score}
138
+ idx = entry.get("index")
139
+ text = entry.get("text") if entry.get("text") else (documents[idx] if idx is not None and idx < len(documents) else None)
140
+ score = entry.get("score", 0)
141
+ if text:
142
+ results.append({"text": text, "score": float(score)})
143
+ else:
144
+ # Fallback: if API returns scores aligned to input order
145
+ scores = data.get("scores")
146
+ if isinstance(scores, list) and len(scores) == len(documents):
147
+ for t, s in zip(documents, scores):
148
+ results.append({"text": t, "score": float(s)})
149
+ # Sort by score desc
150
+ results.sort(key=lambda x: x.get("score", 0), reverse=True)
151
+ return results
152
+ except Exception as e:
153
+ logger.warning(f"[Reranker] Failed calling NVIDIA reranker: {e}")
154
+ # On failure, return original order with neutral scores
155
+ return [{"text": d, "score": 0.0} for d in documents]
api/routes.py ADDED
@@ -0,0 +1,434 @@
1
+ # api/routes.py
2
+ import time
3
+ import os
4
+ import re
5
+ import json
6
+ import logging
7
+ import uuid
8
+ from datetime import datetime, timedelta
9
+ from fastapi import APIRouter, Request
10
+ from fastapi.responses import JSONResponse, HTMLResponse
11
+ from .chatbot import CookingTutorChatbot
12
+
13
+ logger = logging.getLogger("routes")
14
+
15
+ # Create router
16
+ router = APIRouter()
17
+
18
+ # Initialize cooking tutor chatbot
19
+ chatbot = CookingTutorChatbot(
20
+ model_name="gemini-2.5-flash"
21
+ )
22
+
23
+ @router.post("/chat")
24
+ async def chat_endpoint(req: Request):
25
+ """Chat endpoint (web-search only). No DB persistence, no image handling."""
26
+ body = await req.json()
27
+ user_id = body.get("user_id", "anonymous")
28
+ query_raw = body.get("query")
29
+ query = query_raw.strip() if isinstance(query_raw, str) else ""
30
+ lang = body.get("lang", "EN")
31
+ search_mode = body.get("search", True)
32
+ video_mode = body.get("video", False)
33
+ # Optional cooking constraints
34
+ servings = body.get("servings")
35
+ dietary = body.get("dietary") # e.g., ["vegetarian", "gluten-free"]
36
+ allergens = body.get("allergens") # e.g., ["peanuts", "shellfish"]
37
+ equipment = body.get("equipment") # e.g., ["oven", "cast iron skillet"]
38
+ time_limit = body.get("time_limit_minutes") # e.g., 30
39
+ skill_level = body.get("skill_level") # beginner|intermediate|advanced
40
+ cuisine = body.get("cuisine") # e.g., "Italian"
41
+ structured = body.get("structured", False)
42
+
43
+ start = time.time()
44
+ try:
45
+ answer = chatbot.chat(
46
+ user_id,
47
+ query,
48
+ lang,
49
+ search_mode,
50
+ video_mode,
51
+ servings=servings,
52
+ dietary=dietary,
53
+ allergens=allergens,
54
+ equipment=equipment,
55
+ time_limit_minutes=time_limit,
56
+ skill_level=skill_level,
57
+ cuisine=cuisine,
58
+ structured=structured,
59
+ )
60
+ elapsed = time.time() - start
61
+
62
+ # Handle response format (might be string or dict with videos)
63
+ if isinstance(answer, dict):
64
+ response_text = answer.get('text', '')
65
+ video_data = answer.get('videos', [])
66
+ else:
67
+ response_text = answer
68
+ video_data = []
69
+
70
+ # Final response
71
+ response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
72
+
73
+ # Include video data if available
74
+ if video_data:
75
+ response_data["videos"] = video_data
76
+
77
+ return JSONResponse(response_data)
78
+
79
+ except Exception as e:
80
+ logger.error(f"[REQUEST] Error processing request: {e}")
81
+ return JSONResponse({"response": "❌ Failed to get a response. Please try again."})
82
+
83
+ @router.get("/check-request/{request_id}")
84
+ async def check_request_status(request_id: str):
85
+ """Legacy endpoint kept for compatibility; returns not supported."""
86
+ return JSONResponse({"status": "unsupported"})
87
+
88
+ @router.get("/pending-requests/{user_id}")
89
+ async def get_pending_requests(user_id: str):
90
+ """Legacy endpoint kept for compatibility; returns empty list."""
91
+ return JSONResponse({"requests": []})
92
+
93
+ @router.delete("/cleanup-requests")
94
+ async def cleanup_old_requests():
95
+ """Legacy endpoint kept for compatibility; no-op."""
96
+ return JSONResponse({"deleted_count": 0})
97
+
98
+ @router.get("/health")
99
+ async def health_check():
100
+ """Health check endpoint"""
101
+ return {"status": "healthy", "service": "cooking-tutor"}
102
+
103
+ @router.get("/")
104
+ async def root():
105
+ """Root endpoint - Landing page with redirect to main app"""
106
+
107
+ html_content = """
108
+ <!DOCTYPE html>
109
+ <html lang="en">
110
+ <head>
111
+ <meta charset="UTF-8">
112
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
113
+ <title>Cooking Tutor API</title>
114
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
115
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
116
+ <style>
117
+ * {
118
+ margin: 0;
119
+ padding: 0;
120
+ box-sizing: border-box;
121
+ }
122
+
123
+ body {
124
+ font-family: 'Inter', sans-serif;
125
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
126
+ min-height: 100vh;
127
+ display: flex;
128
+ align-items: center;
129
+ justify-content: center;
130
+ overflow: hidden;
131
+ position: relative;
132
+ }
133
+
134
+ /* Animated background particles */
135
+ .particles {
136
+ position: absolute;
137
+ top: 0;
138
+ left: 0;
139
+ width: 100%;
140
+ height: 100%;
141
+ overflow: hidden;
142
+ z-index: 1;
143
+ }
144
+
145
+ .particle {
146
+ position: absolute;
147
+ background: rgba(255, 255, 255, 0.1);
148
+ border-radius: 50%;
149
+ animation: float 6s ease-in-out infinite;
150
+ }
151
+
152
+ .particle:nth-child(1) { width: 80px; height: 80px; top: 20%; left: 10%; animation-delay: 0s; }
153
+ .particle:nth-child(2) { width: 120px; height: 120px; top: 60%; left: 80%; animation-delay: 2s; }
154
+ .particle:nth-child(3) { width: 60px; height: 60px; top: 80%; left: 20%; animation-delay: 4s; }
155
+ .particle:nth-child(4) { width: 100px; height: 100px; top: 10%; left: 70%; animation-delay: 1s; }
156
+ .particle:nth-child(5) { width: 90px; height: 90px; top: 40%; left: 50%; animation-delay: 3s; }
157
+
158
+ @keyframes float {
159
+ 0%, 100% { transform: translateY(0px) rotate(0deg); opacity: 0.7; }
160
+ 50% { transform: translateY(-20px) rotate(180deg); opacity: 1; }
161
+ }
162
+
163
+ .container {
164
+ background: rgba(255, 255, 255, 0.1);
165
+ backdrop-filter: blur(20px);
166
+ border: 1px solid rgba(255, 255, 255, 0.2);
167
+ border-radius: 24px;
168
+ padding: 3rem 2rem;
169
+ text-align: center;
170
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
171
+ max-width: 500px;
172
+ width: 90%;
173
+ position: relative;
174
+ z-index: 2;
175
+ animation: slideUp 0.8s ease-out;
176
+ }
177
+
178
+ @keyframes slideUp {
179
+ from {
180
+ opacity: 0;
181
+ transform: translateY(50px);
182
+ }
183
+ to {
184
+ opacity: 1;
185
+ transform: translateY(0);
186
+ }
187
+ }
188
+
189
+ .logo {
190
+ width: 80px;
191
+ height: 80px;
192
+ background: linear-gradient(135deg, #f59e0b 0%, #ef4444 100%);
193
+ border-radius: 20px;
194
+ display: flex;
195
+ align-items: center;
196
+ justify-content: center;
197
+ margin: 0 auto 1.5rem;
198
+ animation: pulse 2s ease-in-out infinite;
199
+ }
200
+
201
+ @keyframes pulse {
202
+ 0%, 100% { transform: scale(1); }
203
+ 50% { transform: scale(1.05); }
204
+ }
205
+
206
+ .logo i {
207
+ font-size: 2rem;
208
+ color: white;
209
+ }
210
+
211
+ h1 {
212
+ color: white;
213
+ font-size: 2.5rem;
214
+ font-weight: 700;
215
+ margin-bottom: 0.5rem;
216
+ background: linear-gradient(135deg, #ffffff 0%, #f0f9ff 100%);
217
+ -webkit-background-clip: text;
218
+ -webkit-text-fill-color: transparent;
219
+ background-clip: text;
220
+ }
221
+
222
+ .subtitle {
223
+ color: rgba(255, 255, 255, 0.8);
224
+ font-size: 1.1rem;
225
+ margin-bottom: 2rem;
226
+ font-weight: 400;
227
+ }
228
+
229
+ .version {
230
+ color: rgba(255, 255, 255, 0.6);
231
+ font-size: 0.9rem;
232
+ margin-bottom: 2rem;
233
+ font-weight: 300;
234
+ }
235
+
236
+ .redirect-btn {
237
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
238
+ color: white;
239
+ border: none;
240
+ padding: 1rem 2rem;
241
+ border-radius: 12px;
242
+ font-size: 1.1rem;
243
+ font-weight: 600;
244
+ cursor: pointer;
245
+ transition: all 0.3s ease;
246
+ text-decoration: none;
247
+ display: inline-flex;
248
+ align-items: center;
249
+ gap: 0.5rem;
250
+ box-shadow: 0 8px 20px rgba(102, 126, 234, 0.3);
251
+ position: relative;
252
+ overflow: hidden;
253
+ }
254
+
255
+ .redirect-btn::before {
256
+ content: '';
257
+ position: absolute;
258
+ top: 0;
259
+ left: -100%;
260
+ width: 100%;
261
+ height: 100%;
262
+ background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
263
+ transition: left 0.5s;
264
+ }
265
+
266
+ .redirect-btn:hover::before {
267
+ left: 100%;
268
+ }
269
+
270
+ .redirect-btn:hover {
271
+ transform: translateY(-2px);
272
+ box-shadow: 0 12px 30px rgba(102, 126, 234, 0.4);
273
+ }
274
+
275
+ .redirect-btn:active {
276
+ transform: translateY(0);
277
+ }
278
+
279
+ .redirect-btn i {
280
+ font-size: 1.2rem;
281
+ transition: transform 0.3s ease;
282
+ }
283
+
284
+ .redirect-btn:hover i {
285
+ transform: translateX(3px);
286
+ }
287
+
288
+ .features {
289
+ margin-top: 2rem;
290
+ display: grid;
291
+ grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
292
+ gap: 1rem;
293
+ }
294
+
295
+ .feature {
296
+ color: rgba(255, 255, 255, 0.7);
297
+ font-size: 0.9rem;
298
+ font-weight: 500;
299
+ }
300
+
301
+ .feature i {
302
+ display: block;
303
+ font-size: 1.5rem;
304
+ margin-bottom: 0.5rem;
305
+ color: rgba(255, 255, 255, 0.9);
306
+ }
307
+
308
+ @media (max-width: 768px) {
309
+ .container {
310
+ padding: 2rem 1.5rem;
311
+ margin: 1rem;
312
+ }
313
+
314
+ h1 {
315
+ font-size: 2rem;
316
+ }
317
+
318
+ .subtitle {
319
+ font-size: 1rem;
320
+ }
321
+
322
+ .redirect-btn {
323
+ padding: 0.8rem 1.5rem;
324
+ font-size: 1rem;
325
+ }
326
+ }
327
+ </style>
328
+ </head>
329
+ <body>
330
+ <div class="particles">
331
+ <div class="particle"></div>
332
+ <div class="particle"></div>
333
+ <div class="particle"></div>
334
+ <div class="particle"></div>
335
+ <div class="particle"></div>
336
+ </div>
337
+
338
+ <div class="container">
339
+ <div class="logo">
340
+ <i class="fas fa-utensils"></i>
341
+ </div>
342
+
343
+ <h1>Cooking Tutor</h1>
344
+ <p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
345
+ <p class="version">API Version 1.0.0</p>
346
+
347
+ <a href="/" class="redirect-btn" target="_blank">
348
+ <i class="fas fa-external-link-alt"></i>
349
+ Open Frontend
350
+ </a>
351
+
352
+ <div class="features">
353
+ <div class="feature">
354
+ <i class="fas fa-seedling"></i>
355
+ Friendly
356
+ </div>
357
+ <div class="feature">
358
+ <i class="fas fa-list-ol"></i>
359
+ Step-by-step
360
+ </div>
361
+ <div class="feature">
362
+ <i class="fas fa-globe"></i>
363
+ Multi-Language
364
+ </div>
365
+ </div>
366
+ </div>
367
+
368
+ <script>
369
+ // Add some interactive effects
370
+ document.addEventListener('DOMContentLoaded', function() {
371
+ const btn = document.querySelector('.redirect-btn');
372
+ const particles = document.querySelectorAll('.particle');
373
+
374
+ // Add click animation
375
+ btn.addEventListener('click', function(e) {
376
+ // Create ripple effect
377
+ const ripple = document.createElement('span');
378
+ const rect = this.getBoundingClientRect();
379
+ const size = Math.max(rect.width, rect.height);
380
+ const x = e.clientX - rect.left - size / 2;
381
+ const y = e.clientY - rect.top - size / 2;
382
+
383
+ ripple.style.cssText = `
384
+ position: absolute;
385
+ width: ${size}px;
386
+ height: ${size}px;
387
+ left: ${x}px;
388
+ top: ${y}px;
389
+ background: rgba(255, 255, 255, 0.3);
390
+ border-radius: 50%;
391
+ transform: scale(0);
392
+ animation: ripple 0.6s ease-out;
393
+ pointer-events: none;
394
+ `;
395
+
396
+ this.appendChild(ripple);
397
+
398
+ setTimeout(() => {
399
+ ripple.remove();
400
+ }, 600);
401
+ });
402
+
403
+ // Add CSS for ripple animation
404
+ const style = document.createElement('style');
405
+ style.textContent = `
406
+ @keyframes ripple {
407
+ to {
408
+ transform: scale(2);
409
+ opacity: 0;
410
+ }
411
+ }
412
+ `;
413
+ document.head.appendChild(style);
414
+
415
+ // Animate particles on mouse move
416
+ document.addEventListener('mousemove', function(e) {
417
+ const x = e.clientX / window.innerWidth;
418
+ const y = e.clientY / window.innerHeight;
419
+
420
+ particles.forEach((particle, index) => {
421
+ const speed = (index + 1) * 0.5;
422
+ const xOffset = (x - 0.5) * speed * 20;
423
+ const yOffset = (y - 0.5) * speed * 20;
424
+
425
+ particle.style.transform = `translate(${xOffset}px, ${yOffset}px)`;
426
+ });
427
+ });
428
+ });
429
+ </script>
430
+ </body>
431
+ </html>
432
+ """
433
+
434
+ return HTMLResponse(content=html_content)
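These routes can be smoke-tested in-process with FastAPI's TestClient; the sketch below assumes the environment the app needs at import time is available (notably `FlashAPI` and the local model cache) and simply checks `/health` plus one `/chat` round trip.

```python
# In-process smoke-test sketch using FastAPI's TestClient.
# Assumes FlashAPI and the embedding model cache are available at import time.
from fastapi.testclient import TestClient
from api.app import app

client = TestClient(app)

assert client.get("/health").json()["status"] == "healthy"

resp = client.post("/chat", json={
    "query": "What is the best way to boil pasta?",
    "lang": "EN",
    "search": False,          # skip web search for a faster local check
    "user_id": "smoke-test",
})
print(resp.json()["response"][:200])
```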
main.py ADDED
@@ -0,0 +1,13 @@
+ # main.py - Entry point for the Cooking Tutor API
+ import uvicorn
+ from api.app import app
+
+ if __name__ == "__main__":
+     print("🍳 Starting Cooking Tutor API...")
+     uvicorn.run(
+         app,
+         host="0.0.0.0",
+         port=7860,
+         log_level="info",
+         reload=False  # Set to True for development
+     )
memory/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Memory package
+ from .memory import MemoryManager
memory/memory.py ADDED
@@ -0,0 +1,331 @@
1
+ # memory/memory.py
2
+ import re, time, hashlib, asyncio, os
3
+ from collections import defaultdict, deque
4
+ from typing import List, Dict
5
+ import numpy as np
6
+ import faiss
7
+ from sentence_transformers import SentenceTransformer
8
+ from google import genai # must be configured in app.py and imported globally
9
+ import logging
10
+ from models.summarizer import get_summarizer
11
+
12
+ _LLM_SMALL = "gemini-2.5-flash-lite-preview-06-17"
13
+ # Load embedding model
14
+ EMBED = SentenceTransformer("/app/model_cache", device="cpu").half()
15
+ logger = logging.getLogger("rag-agent")
16
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
17
+
18
+ api_key = os.getenv("FlashAPI")
19
+ client = genai.Client(api_key=api_key)
20
+
21
+ class MemoryManager:
22
+ def __init__(self, max_users=1000, history_per_user=20, max_chunks=60):
23
+ # STM: recent conversation summaries (topic + summary), up to 5 entries
24
+ self.stm_summaries = defaultdict(lambda: deque(maxlen=history_per_user)) # deque of {topic,text,vec,timestamp,used}
25
+ # Legacy raw cache (kept for compatibility if needed)
26
+ self.text_cache = defaultdict(lambda: deque(maxlen=history_per_user))
27
+ # LTM: semantic chunk store (approx 3 chunks x 20 rounds)
28
+ self.chunk_index = defaultdict(self._new_index) # user_id -> faiss index
29
+ self.chunk_meta = defaultdict(list) # '' -> list[{text,tag,vec,timestamp,used}]
30
+ self.user_queue = deque(maxlen=max_users) # LRU of users
31
+ self.max_chunks = max_chunks # hard cap per user
32
+ self.chunk_cache = {} # hash(query+resp) -> [chunks]
33
+
34
+ # ---------- Public API ----------
35
+ def add_exchange(self, user_id: str, query: str, response: str, lang: str = "EN"):
36
+ self._touch_user(user_id)
37
+ # Keep raw record (optional)
38
+ self.text_cache[user_id].append(((query or "").strip(), (response or "").strip()))
39
+ if not response: return []
40
+ # Avoid re-chunking identical response
41
+ cache_key = hashlib.md5((query + response).encode()).hexdigest()
42
+ if cache_key in self.chunk_cache:
43
+ chunks = self.chunk_cache[cache_key]
44
+ else:
45
+ chunks = self.chunk_response(response, lang, question=query)
46
+ self.chunk_cache[cache_key] = chunks
47
+ # Update STM with merging/deduplication
48
+ for chunk in chunks:
49
+ self._upsert_stm(user_id, chunk, lang)
50
+ # Update LTM with merging/deduplication
51
+ self._upsert_ltm(user_id, chunks, lang)
52
+ return chunks
53
+
54
+ def get_relevant_chunks(self, user_id: str, query: str, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
55
+ """Return texts of chunks whose cosine similarity ≥ min_sim."""
56
+ if self.chunk_index[user_id].ntotal == 0:
57
+ return []
58
+ # Encode chunk
59
+ qvec = self._embed(query)
60
+ sims, idxs = self.chunk_index[user_id].search(np.array([qvec]), k=top_k)
61
+ results = []
62
+ # Append related result with smart-decay to optimize storage and prioritize most-recent chat
63
+ for sim, idx in zip(sims[0], idxs[0]):
64
+ if idx < len(self.chunk_meta[user_id]) and sim >= min_sim:
65
+ chunk = self.chunk_meta[user_id][idx]
66
+ chunk["used"] += 1 # increment usage
67
+ # Decay function
68
+ age_sec = time.time() - chunk["timestamp"]
69
+ decay = 1.0 / (1.0 + age_sec / 300) # 5-min half-life
70
+ score = sim * decay * (1 + 0.1 * chunk["used"])
71
+ # Append chunk with score
72
+ results.append((score, chunk))
73
+ # Sort result on best scored
74
+ results.sort(key=lambda x: x[0], reverse=True)
75
+ # logger.info(f"[Memory] RAG Retrieved Topic: {results}") # Inspect vector data
76
+ return [f"### Topic: {c['tag']}\n{c['text']}" for _, c in results]
77
+
78
+ def get_recent_chat_history(self, user_id: str, num_turns: int = 5) -> List[Dict]:
79
+ """
80
+ Get the most recent short-term memory summaries.
81
+ Returns: a list of entries containing only the summarized bot context.
82
+ """
83
+ if user_id not in self.stm_summaries:
84
+ return []
85
+ recent = list(self.stm_summaries[user_id])[-num_turns:]
86
+ formatted = []
87
+ for entry in recent:
88
+ formatted.append({
89
+ "user": "",
90
+ "bot": f"Topic: {entry['topic']}\n{entry['text']}",
91
+ "timestamp": entry.get("timestamp", time.time())
92
+ })
93
+ return formatted
94
+
95
+ def get_context(self, user_id: str, num_turns: int = 5) -> str:
96
+ # Prefer STM summaries
97
+ history = self.get_recent_chat_history(user_id, num_turns=num_turns)
98
+ return "\n".join(h["bot"] for h in history)
99
+
100
+ def get_contextual_chunks(self, user_id: str, current_query: str, lang: str = "EN") -> str:
101
+ """
102
+ Use NVIDIA Llama to create a summarization of relevant context from both recent history and RAG chunks.
103
+ This ensures conversational continuity while providing a concise summary for the main LLM.
104
+ """
105
+ # Get both types of context
106
+ recent_history = self.get_recent_chat_history(user_id, num_turns=5)
107
+ rag_chunks = self.get_relevant_chunks(user_id, current_query, top_k=3)
108
+
109
+ logger.info(f"[Contextual] Retrieved {len(recent_history)} recent history items")
110
+ logger.info(f"[Contextual] Retrieved {len(rag_chunks)} RAG chunks")
111
+
112
+ # Return empty string if no context is found
113
+ if not recent_history and not rag_chunks:
114
+ logger.info(f"[Contextual] No context found, returning empty string")
115
+ return ""
116
+
117
+ # Prepare context for summarization
118
+ context_parts = []
119
+ # Add recent chat history
120
+ if recent_history:
121
+ history_text = "\n".join([
122
+ f"User: {item['user']}\nBot: {item['bot']}"
123
+ for item in recent_history
124
+ ])
125
+ context_parts.append(f"Recent conversation history:\n{history_text}")
126
+ # Add RAG chunks
127
+ if rag_chunks:
128
+ rag_text = "\n".join(rag_chunks)
129
+ context_parts.append(f"Semantically relevant historical cooking information:\n{rag_text}")
130
+
131
+ # Combine all context
132
+ full_context = "\n\n".join(context_parts)
133
+
134
+ # Use summarizer to create concise summary
135
+ try:
136
+ summary = summarizer.summarize_text(full_context, max_length=300)
137
+ logger.info(f"[Contextual] Generated summary using NVIDIA Llama: {len(summary)} characters")
138
+ return summary
139
+ except Exception as e:
140
+ logger.error(f"[Contextual] Summarization failed: {e}")
141
+ return full_context[:500] + "..." if len(full_context) > 500 else full_context
142
+
143
+ def chunk_response(self, response: str, lang: str, question: str = "") -> List[Dict]:
144
+ """
145
+ Use NVIDIA Llama to chunk and summarize response by cooking topics.
146
+ Returns: [{"tag": ..., "text": ...}, ...]
147
+ """
148
+ if not response:
149
+ return []
150
+
151
+ try:
152
+ # Use summarizer to chunk and summarize
153
+ chunks = summarizer.chunk_response(response, max_chunk_size=500)
154
+
155
+ # Convert to the expected format
156
+ result_chunks = []
157
+ for i, chunk in enumerate(chunks):
158
+ # Extract topic from chunk (first sentence or key cooking terms)
159
+ topic = self._extract_topic_from_chunk(chunk)
160
+
161
+ result_chunks.append({
162
+ "tag": topic,
163
+ "text": chunk
164
+ })
165
+
166
+ logger.info(f"[Memory] 📦 NVIDIA Llama summarized {len(result_chunks)} chunks")
167
+ return result_chunks
168
+
169
+ except Exception as e:
170
+ logger.error(f"[Memory] NVIDIA Llama chunking failed: {e}")
171
+ # Fallback to simple chunking
172
+ return self._fallback_chunking(response)
173
+
174
+ def _extract_topic_from_chunk(self, chunk: str) -> str:
175
+ """Extract a concise topic from a chunk"""
176
+ # Look for cooking terms or first sentence
177
+ sentences = chunk.split('.')
178
+ if sentences:
179
+ first_sentence = sentences[0].strip()
180
+ if len(first_sentence) > 50:
181
+ first_sentence = first_sentence[:50] + "..."
182
+ return first_sentence
183
+ return "Cooking Information"
184
+
185
+ def _fallback_chunking(self, response: str) -> List[Dict]:
186
+ """Fallback chunking when NVIDIA Llama fails"""
187
+ # Simple sentence-based chunking
188
+ sentences = re.split(r'[.!?]+', response)
189
+ chunks = []
190
+ current_chunk = ""
191
+
192
+ for sentence in sentences:
193
+ sentence = sentence.strip()
194
+ if not sentence:
195
+ continue
196
+
197
+ if len(current_chunk) + len(sentence) > 300:
198
+ if current_chunk:
199
+ chunks.append({
200
+ "tag": "Cooking Information",
201
+ "text": current_chunk.strip()
202
+ })
203
+ current_chunk = sentence
204
+ else:
205
+ current_chunk += sentence + ". "
206
+
207
+ if current_chunk:
208
+ chunks.append({
209
+ "tag": "Cooking Information",
210
+ "text": current_chunk.strip()
211
+ })
212
+
213
+ return chunks
214
+
215
+ # ---------- Private Methods ----------
216
+ def _touch_user(self, user_id: str):
217
+ """Update LRU queue"""
218
+ if user_id in self.user_queue:
219
+ self.user_queue.remove(user_id)
220
+ self.user_queue.append(user_id)
221
+
222
+ def _new_index(self):
223
+ """Create new FAISS index"""
224
+ return faiss.IndexFlatIP(384) # 384-dim embeddings
225
+
226
+ def _upsert_stm(self, user_id: str, chunk: Dict, lang: str):
227
+ """Update short-term memory with merging/deduplication"""
228
+ topic = chunk["tag"]
229
+ text = chunk["text"]
230
+
231
+ # Check for similar topics in STM
232
+ for entry in self.stm_summaries[user_id]:
233
+ if self._topics_similar(topic, entry["topic"]):
234
+ # Merge with existing entry
235
+ entry["text"] = summarizer.summarize_text(
236
+ f"{entry['text']}\n{text}",
237
+ max_length=200
238
+ )
239
+ entry["timestamp"] = time.time()
240
+ return
241
+
242
+ # Add new entry
243
+ self.stm_summaries[user_id].append({
244
+ "topic": topic,
245
+ "text": text,
246
+ "vec": self._embed(f"{topic} {text}"),
247
+ "timestamp": time.time(),
248
+ "used": 0
249
+ })
250
+
251
+ def _upsert_ltm(self, user_id: str, chunks: List[Dict], lang: str):
252
+ """Update long-term memory with merging/deduplication"""
253
+ for chunk in chunks:
254
+ # Check for similar chunks in LTM
255
+ similar_idx = self._find_similar_chunk(user_id, chunk["text"])
256
+
257
+ if similar_idx is not None:
258
+ # Merge with existing chunk
259
+ existing = self.chunk_meta[user_id][similar_idx]
260
+ merged_text = summarizer.summarize_text(
261
+ f"{existing['text']}\n{chunk['text']}",
262
+ max_length=300
263
+ )
264
+ existing["text"] = merged_text
265
+ existing["timestamp"] = time.time()
266
+ else:
267
+ # Add new chunk
268
+ if len(self.chunk_meta[user_id]) >= self.max_chunks:
269
+ # Remove oldest chunk
270
+ self._remove_oldest_chunk(user_id)
271
+
272
+ vec = self._embed(chunk["text"])
273
+ self.chunk_index[user_id].add(np.array([vec]))
274
+ self.chunk_meta[user_id].append({
275
+ "text": chunk["text"],
276
+ "tag": chunk["tag"],
277
+ "vec": vec,
278
+ "timestamp": time.time(),
279
+ "used": 0
280
+ })
281
+
282
+ def _topics_similar(self, topic1: str, topic2: str) -> bool:
283
+ """Check if two topics are similar"""
284
+ # Simple similarity check based on common words
285
+ words1 = set(topic1.lower().split())
286
+ words2 = set(topic2.lower().split())
287
+ intersection = words1.intersection(words2)
288
+ return len(intersection) >= 2
289
+
290
+ def _find_similar_chunk(self, user_id: str, text: str) -> int:
291
+ """Find similar chunk in LTM"""
292
+ if not self.chunk_meta[user_id]:
293
+ return None
294
+
295
+ text_vec = self._embed(text)
296
+ sims, idxs = self.chunk_index[user_id].search(np.array([text_vec]), k=3)
297
+
298
+ for sim, idx in zip(sims[0], idxs[0]):
299
+ if sim > 0.8: # High similarity threshold
300
+ return int(idx)
301
+ return None
302
+
303
+ def _remove_oldest_chunk(self, user_id: str):
304
+ """Remove the oldest chunk from LTM"""
305
+ if not self.chunk_meta[user_id]:
306
+ return
307
+
308
+ # Find oldest chunk
309
+ oldest_idx = min(range(len(self.chunk_meta[user_id])),
310
+ key=lambda i: self.chunk_meta[user_id][i]["timestamp"])
311
+
312
+ # Remove from index and metadata
313
+ self.chunk_meta[user_id].pop(oldest_idx)
314
+ # Note: FAISS doesn't support direct removal, so we rebuild the index
315
+ self._rebuild_index(user_id)
316
+
317
+ def _rebuild_index(self, user_id: str):
318
+ """Rebuild FAISS index after removal"""
319
+ if not self.chunk_meta[user_id]:
320
+ self.chunk_index[user_id] = self._new_index()
321
+ return
322
+
323
+ vectors = [chunk["vec"] for chunk in self.chunk_meta[user_id]]
324
+ self.chunk_index[user_id] = self._new_index()
325
+ self.chunk_index[user_id].add(np.array(vectors))
326
+
327
+ @staticmethod
328
+ def _embed(text: str):
329
+ vec = EMBED.encode(text, convert_to_numpy=True)
330
+ # L2 normalise for cosine on IndexFlatIP
331
+ return vec / (np.linalg.norm(vec) + 1e-9)
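The retrieval scoring in `get_relevant_chunks` combines cosine similarity with a five-minute-half-life time decay and a usage bonus; the tiny sketch below reproduces just that formula so the weighting is easy to inspect in isolation (the numbers are arbitrary examples).

```python
# Standalone illustration of the chunk-scoring formula used in get_relevant_chunks:
# score = similarity * decay * (1 + 0.1 * times_used), with a 5-minute decay half-life.
import time

def chunk_score(similarity: float, timestamp: float, used: int, now: float | None = None) -> float:
    now = time.time() if now is None else now
    age_sec = now - timestamp
    decay = 1.0 / (1.0 + age_sec / 300)      # decay reaches 0.5 at 5 minutes of age
    return similarity * decay * (1 + 0.1 * used)

now = time.time()
fresh = chunk_score(0.80, now - 60, used=0, now=now)    # recent, never reused
stale = chunk_score(0.85, now - 3600, used=0, now=now)  # older but slightly more similar
print(f"fresh={fresh:.3f} stale={stale:.3f}")           # the fresh chunk usually wins
```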
models/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Models package
+ from .llama import NVIDIALLamaClient, process_search_query
+ from .summarizer import TextSummarizer, summarizer
models/download_model.py ADDED
@@ -0,0 +1,51 @@
+ # download_model.py
+ ### --- A. transformer and embedder ---
+ import os
+ import shutil
+ from huggingface_hub import snapshot_download
+
+ # Set up paths
+ MODEL_REPO = "sentence-transformers/all-MiniLM-L6-v2"
+ MODEL_CACHE_DIR = "/app/model_cache"
+
+ print("⏳ Downloading the SentenceTransformer model...")
+ model_path = snapshot_download(repo_id=MODEL_REPO, cache_dir=MODEL_CACHE_DIR)
+
+ print("Model path: ", model_path)
+
+ # Ensure the directory exists
+ if not os.path.exists(MODEL_CACHE_DIR):
+     os.makedirs(MODEL_CACHE_DIR)
+
+ # Move all contents from the snapshot folder
+ if os.path.exists(model_path):
+     print(f"📂 Moving model files from {model_path} to {MODEL_CACHE_DIR}...")
+
+     for item in os.listdir(model_path):
+         source = os.path.join(model_path, item)
+         destination = os.path.join(MODEL_CACHE_DIR, item)
+
+         if os.path.isdir(source):
+             shutil.copytree(source, destination, dirs_exist_ok=True)
+         else:
+             shutil.copy2(source, destination)
+
+     print(f"✅ Model extracted and flattened in {MODEL_CACHE_DIR}")
+ else:
+     print("❌ No snapshot directory found!")
+     exit(1)
+
+ # Verify structure after moving
+ print("\n📂 LLM Model Structure (Build Level):")
+ for root, dirs, files in os.walk(MODEL_CACHE_DIR):
+     print(f"📁 {root}/")
+     for file in files:
+         print(f"    📄 {file}")
+
+
+ ### --- B. translation modules ---
+ from transformers import pipeline
+ print("⏬ Downloading Vietnamese–English translator...")
+ _ = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en")
+ print("⏬ Downloading Chinese–English translator...")
+ _ = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
models/llama.py ADDED
@@ -0,0 +1,125 @@
1
+ import os
2
+ import requests
3
+ import json
4
+ import logging
5
+ import time
6
+ from typing import List, Dict, Tuple
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class NVIDIALLamaClient:
11
+ def __init__(self):
12
+ self.api_key = os.getenv("NVIDIA_URI")
13
+ if not self.api_key:
14
+ raise ValueError("NVIDIA_URI environment variable not set")
15
+
16
+ # Correct NVIDIA Integrate API base
17
+ self.base_url = "https://integrate.api.nvidia.com/v1"
18
+ self.model = "meta/llama-3.1-8b-instruct"
19
+
20
+ def generate_keywords(self, user_query: str) -> List[str]:
21
+ """Use Llama to generate search keywords from user query"""
22
+ try:
23
+ prompt = f"""Given this medical question: "{user_query}"
24
+
25
+ Generate 3-5 specific search keywords that would help find relevant medical information online.
26
+ Focus on medical terms, symptoms, conditions, treatments, or procedures mentioned.
27
+ Return only the keywords separated by commas, no explanations.
28
+
29
+ Keywords:"""
30
+
31
+ response = self._call_llama(prompt)
32
+
33
+ # Extract keywords from response
34
+ keywords = [kw.strip() for kw in response.split(',') if kw.strip()]
35
+ logger.info(f"Generated keywords: {keywords}")
36
+ return keywords[:5] # Limit to 5 keywords
37
+
38
+ except Exception as e:
39
+ logger.error(f"Failed to generate keywords: {e}")
40
+ return [user_query] # Fallback to original query
41
+
42
+ def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
43
+ """Use Llama to summarize documents and return summary with URL mapping"""
44
+ try:
45
+ # Import summarizer lazily (and relatively, since this lives in the models package) to avoid circular imports
45
+
46
+ from .summarizer import summarizer
47
+
48
+ # Use the summarizer for document summarization
49
+ combined_summary, url_mapping = summarizer.summarize_documents(documents, user_query)
50
+
51
+ return combined_summary, url_mapping
52
+
53
+ except Exception as e:
54
+ logger.error(f"Failed to summarize documents: {e}")
55
+ return "", {}
56
+
57
+ def _call_llama(self, prompt: str, max_retries: int = 3) -> str:
58
+ """Make API call to NVIDIA Llama model with retry logic"""
59
+ for attempt in range(max_retries):
60
+ try:
61
+ headers = {
62
+ "Authorization": f"Bearer {self.api_key}",
63
+ "Content-Type": "application/json"
64
+ }
65
+
66
+ payload = {
67
+ "model": self.model,
68
+ "messages": [
69
+ {
70
+ "role": "user",
71
+ "content": prompt
72
+ }
73
+ ],
74
+ "temperature": 0.7,
75
+ "max_tokens": 1000
76
+ }
77
+
78
+ response = requests.post(
79
+ f"{self.base_url}/chat/completions",
80
+ headers=headers,
81
+ json=payload,
82
+ timeout=30
83
+ )
84
+
85
+ response.raise_for_status()
86
+ result = response.json()
87
+
88
+ content = result['choices'][0]['message']['content'].strip()
89
+ if not content:
90
+ raise ValueError("Empty response from Llama API")
91
+
92
+ return content
93
+
94
+ except requests.exceptions.Timeout:
95
+ logger.warning(f"Llama API timeout (attempt {attempt + 1}/{max_retries})")
96
+ if attempt == max_retries - 1:
97
+ raise
98
+ time.sleep(2 ** attempt) # Exponential backoff
99
+
100
+ except requests.exceptions.RequestException as e:
101
+ logger.warning(f"Llama API request failed (attempt {attempt + 1}/{max_retries}): {e}")
102
+ if attempt == max_retries - 1:
103
+ raise
104
+ time.sleep(2 ** attempt)
105
+
106
+ except Exception as e:
107
+ logger.error(f"Llama API call failed: {e}")
108
+ raise
109
+
110
+ def process_search_query(user_query: str, search_results: List[Dict]) -> Tuple[str, Dict[int, str]]:
111
+ """Process search results using Llama model"""
112
+ try:
113
+ llama_client = NVIDIALLamaClient()
114
+
115
+ # Generate search keywords
116
+ keywords = llama_client.generate_keywords(user_query)
117
+
118
+ # Summarize documents
119
+ summary, url_mapping = llama_client.summarize_documents(search_results, user_query)
120
+
121
+ return summary, url_mapping
122
+
123
+ except Exception as e:
124
+ logger.error(f"Failed to process search query: {e}")
125
+ return "", {}
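For orientation, a hypothetical usage sketch of this module (not part of the diff; it assumes NVIDIA_URI is set to a valid API key, and the document dict is a placeholder shaped like the ones summarize_documents expects):

from models.llama import NVIDIALLamaClient, process_search_query

client = NVIDIALLamaClient()  # raises ValueError if NVIDIA_URI is not set
keywords = client.generate_keywords("What helps prevent chronic migraines?")
print(keywords)  # falls back to [original query] if the API call fails

docs = [{"id": 1, "url": "https://example.org/a", "title": "Example page", "content": "..."}]
summary, url_mapping = process_search_query("What helps prevent chronic migraines?", docs)
print(summary, url_mapping)  # url_mapping maps document ids to their source URLs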
models/summarizer.py ADDED
@@ -0,0 +1,216 @@
1
+ import re
2
+ import logging
3
+ from typing import List, Dict, Tuple
4
+ from .llama import NVIDIALLamaClient
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class TextSummarizer:
9
+ def __init__(self):
10
+ self.llama_client = NVIDIALLamaClient()
11
+
12
+ def clean_text(self, text: str) -> str:
13
+ """Clean and normalize text for summarization"""
14
+ if not text:
15
+ return ""
16
+
17
+ # Remove common conversation starters and fillers
18
+ conversation_patterns = [
19
+ r'\b(hi|hello|hey|sure|okay|yes|no|thanks|thank you)\b',
20
+ r'\b(here is|this is|let me|i will|i can|i would)\b',
21
+ r'\b(summarize|summary|here\'s|here is)\b',
22
+ r'\b(please|kindly|would you|could you)\b',
23
+ r'\b(um|uh|er|ah|well|so|like|you know)\b'
24
+ ]
25
+
26
+ # Remove excessive whitespace and normalize
27
+ text = re.sub(r'\s+', ' ', text)
28
+ text = re.sub(r'\n+', ' ', text)
29
+
30
+ # Remove conversation patterns
31
+ for pattern in conversation_patterns:
32
+ text = re.sub(pattern, '', text, flags=re.IGNORECASE)
33
+
34
+ # Remove extra punctuation and normalize
35
+ text = re.sub(r'[.]{2,}', '.', text)
36
+ text = re.sub(r'[!]{2,}', '!', text)
37
+ text = re.sub(r'[?]{2,}', '?', text)
38
+
39
+ return text.strip()
40
+
41
+ def extract_key_phrases(self, text: str) -> List[str]:
42
+ """Extract key medical phrases and terms"""
43
+ if not text:
44
+ return []
45
+
46
+ # Medical term patterns
47
+ medical_patterns = [
48
+ r'\b(?:symptoms?|diagnosis|treatment|therapy|medication|drug|disease|condition|syndrome)\b',
49
+ r'\b(?:patient|doctor|physician|medical|clinical|healthcare)\b',
50
+ r'\b(?:blood pressure|heart rate|temperature|pulse|respiration)\b',
51
+ r'\b(?:acute|chronic|severe|mild|moderate|serious|critical)\b',
52
+ r'\b(?:pain|ache|discomfort|swelling|inflammation|infection)\b'
53
+ ]
54
+
55
+ key_phrases = []
56
+ for pattern in medical_patterns:
57
+ matches = re.findall(pattern, text, re.IGNORECASE)
58
+ key_phrases.extend(matches)
59
+
60
+ return list(set(key_phrases)) # Remove duplicates
61
+
62
+ def summarize_text(self, text: str, max_length: int = 200) -> str:
63
+ """Summarize text using NVIDIA Llama model"""
64
+ try:
65
+ if not text or len(text.strip()) < 50:
66
+ return text
67
+
68
+ # Clean the text first
69
+ cleaned_text = self.clean_text(text)
70
+
71
+ # Extract key phrases for context
72
+ key_phrases = self.extract_key_phrases(cleaned_text)
73
+ key_phrases_str = ", ".join(key_phrases[:5]) if key_phrases else "medical information"
74
+
75
+ # Create optimized prompt
76
+ prompt = f"""Summarize this medical text in {max_length} characters or less. Focus only on key medical facts, symptoms, treatments, and diagnoses. Do not include greetings, confirmations, or conversational elements.
77
+
78
+ Key terms: {key_phrases_str}
79
+
80
+ Text: {cleaned_text[:1500]}
81
+
82
+ Summary:"""
83
+
84
+ summary = self.llama_client._call_llama(prompt)
85
+
86
+ # Post-process summary
87
+ summary = self.clean_text(summary)
88
+
89
+ # Ensure it's within length limit
90
+ if len(summary) > max_length:
91
+ summary = summary[:max_length-3] + "..."
92
+
93
+ return summary
94
+
95
+ except Exception as e:
96
+ logger.error(f"Summarization failed: {e}")
97
+ # Fallback to simple truncation
98
+ return self.clean_text(text)[:max_length]
99
+
100
+ def summarize_for_query(self, text: str, query: str, max_length: int = 220) -> str:
101
+ """Summarize text focusing strictly on information relevant to the query.
102
+ Returns an empty string if nothing relevant is found.
103
+ """
104
+ try:
105
+ if not text:
106
+ return ""
107
+ cleaned_text = self.clean_text(text)
108
+ if not cleaned_text:
109
+ return ""
110
+
111
+ # Short, strict prompt to avoid verbosity; instruct to output NOTHING if irrelevant
112
+ prompt = (
113
+ f"You extract only medically relevant facts that help answer: '{query}'. "
114
+ f"Respond with a concise bullet list (<= {max_length} chars total). "
115
+ "If the content is irrelevant, respond with EXACTLY: NONE.\n\n"
116
+ f"Content: {cleaned_text[:1600]}\n\nRelevant facts:"
117
+ )
118
+
119
+ summary = self.llama_client._call_llama(prompt)
120
+ summary = self.clean_text(summary)
121
+ if not summary or summary.upper().strip() == "NONE":
122
+ return ""
123
+ if len(summary) > max_length:
124
+ summary = summary[:max_length-3] + "..."
125
+ return summary
126
+ except Exception as e:
127
+ logger.warning(f"Query-focused summarization failed: {e}")
128
+ return ""
129
+
130
+ def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
131
+ """Summarize multiple documents with URL mapping"""
132
+ try:
133
+ doc_summaries = []
134
+ url_mapping = {}
135
+
136
+ for doc in documents:
137
+ doc_id = doc['id']
138
+ url_mapping[doc_id] = doc['url']
139
+
140
+ # Create focused summary for each document
141
+ summary_prompt = f"""Summarize this medical document in 2-3 sentences, focusing on information relevant to: "{user_query}"
142
+
143
+ Document: {doc['title']}
144
+ Content: {doc['content'][:800]}
145
+
146
+ Key medical information:"""
147
+
148
+ summary = self.llama_client._call_llama(summary_prompt)
149
+ summary = self.clean_text(summary)
150
+
151
+ doc_summaries.append(f"Document {doc_id}: {summary}")
152
+
153
+ combined_summary = "\n\n".join(doc_summaries)
154
+ return combined_summary, url_mapping
155
+
156
+ except Exception as e:
157
+ logger.error(f"Document summarization failed: {e}")
158
+ return "", {}
159
+
160
+ def summarize_conversation_chunk(self, chunk: str) -> str:
161
+ """Summarize a conversation chunk for memory"""
162
+ try:
163
+ if not chunk or len(chunk.strip()) < 30:
164
+ return chunk
165
+
166
+ cleaned_chunk = self.clean_text(chunk)
167
+
168
+ prompt = f"""Summarize this medical conversation in 1-2 sentences. Focus only on medical facts, symptoms, treatments, or diagnoses discussed. Remove greetings and conversational elements.
169
+
170
+ Conversation: {cleaned_chunk[:1000]}
171
+
172
+ Medical summary:"""
173
+
174
+ summary = self.llama_client._call_llama(prompt)
175
+ return self.clean_text(summary)
176
+
177
+ except Exception as e:
178
+ logger.error(f"Conversation summarization failed: {e}")
179
+ return self.clean_text(chunk)[:150]
180
+
181
+ def chunk_response(self, response: str, max_chunk_size: int = 500) -> List[str]:
182
+ """Split response into chunks and summarize each"""
183
+ try:
184
+ if not response or len(response) <= max_chunk_size:
185
+ return [response]
186
+
187
+ # Split by sentences first
188
+ sentences = re.split(r'[.!?]+', response)
189
+ chunks = []
190
+ current_chunk = ""
191
+
192
+ for sentence in sentences:
193
+ sentence = sentence.strip()
194
+ if not sentence:
195
+ continue
196
+
197
+ # Check if adding this sentence would exceed limit
198
+ if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
199
+ chunks.append(self.summarize_conversation_chunk(current_chunk))
200
+ current_chunk = sentence
201
+ else:
202
+ current_chunk += sentence + ". "
203
+
204
+ # Add the last chunk
205
+ if current_chunk:
206
+ chunks.append(self.summarize_conversation_chunk(current_chunk))
207
+
208
+ return chunks
209
+
210
+ except Exception as e:
211
+ logger.error(f"Response chunking failed: {e}")
212
+ return [response]
213
+
214
+ # Global summarizer instance
215
+ summarizer = TextSummarizer()
216
+
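A hypothetical usage sketch for the shared instance above (not part of the diff; every call goes through the NVIDIA Llama client, so NVIDIA_URI must be set and network access is required):

from models.summarizer import summarizer

long_text = "Patient reports severe headaches and nausea for two weeks. " * 20
short = summarizer.summarize_text(long_text, max_length=200)            # at most 200 characters
focused = summarizer.summarize_for_query(long_text, "headache causes")  # "" when nothing is relevant
chunks = summarizer.chunk_response(long_text, max_chunk_size=500)       # list of summarised chunks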
models/warmup.py ADDED
@@ -0,0 +1,8 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ import torch
3
+
4
+ print("🚀 Warming up model...")
5
+ embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
6
+ # embedding_model = embedding_model.half() # Reduce memory
7
+ embedding_model.to(torch.device("cpu"))
8
+ print("✅ Model warm-up complete!")
requirements.txt ADDED
@@ -0,0 +1,25 @@
1
+ # requirements.txt - Cooking Tutor API
2
+ # **LLMs**
3
+ google-genai
4
+ huggingface_hub
5
+ # **Memory & Embeddings**
6
+ faiss-cpu
7
+ sentence-transformers
8
+ # **Translation**
9
+ transformers
10
+ accelerate
11
+ sentencepiece
12
+ # **Environment**
13
+ python-dotenv
14
+ # **Deployment**
15
+ uvicorn
16
+ fastapi
17
+ torch # For translation models
18
+ psutil # System monitoring
19
+ # **Web Search**
20
+ requests
21
+ beautifulsoup4
22
+ langdetect
23
+ # **Data Processing**
24
+ pandas
25
+ numpy
search/.DS_Store ADDED
Binary file (6.15 kB). View file
 
search/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ # Search package
2
+ from .search import WebSearcher, search_web, search_web_with_content, search_medical, search_multilingual_medical, search_videos, search_comprehensive
3
+ from .coordinator import SearchCoordinator
4
+ from .engines import DuckDuckGoEngine, MedicalSearchEngine, MultilingualMedicalEngine, VideoSearchEngine
5
+ from .extractors import ContentExtractor
6
+ from .processors import MedicalSearchProcessor, LanguageProcessor, SourceAggregator, EnhancedContentProcessor
7
+
8
+ __all__ = [
9
+ 'WebSearcher',
10
+ 'search_web',
11
+ 'search_web_with_content',
12
+ 'search_medical',
13
+ 'search_multilingual_medical',
14
+ 'search_videos',
15
+ 'search_comprehensive',
16
+ 'SearchCoordinator',
17
+ 'DuckDuckGoEngine',
18
+ 'MedicalSearchEngine',
19
+ 'MultilingualMedicalEngine',
20
+ 'VideoSearchEngine',
21
+ 'ContentExtractor',
22
+ 'MedicalSearchProcessor',
23
+ 'LanguageProcessor',
24
+ 'SourceAggregator',
25
+ 'EnhancedContentProcessor'
26
+ ]
search/coordinator.py ADDED
@@ -0,0 +1,504 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+
6
+ from .engines.duckduckgo import DuckDuckGoEngine
7
+ from .engines.cooking import CookingSearchEngine
8
+ from .engines.multilingual import MultilingualCookingEngine
9
+ from .engines.video import VideoSearchEngine
10
+ from .extractors.content import ContentExtractor
11
+ from .processors.cooking import CookingSearchProcessor
12
+ from .processors.language import LanguageProcessor
13
+ from .processors.sources import SourceAggregator
14
+ from .processors.enhanced import EnhancedContentProcessor
15
+ # Reranker removed - using simple relevance scoring for cooking content
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class SearchCoordinator:
20
+ """Coordinate multiple search strategies for comprehensive cooking information"""
21
+
22
+ def __init__(self, max_workers: int = 3):
23
+ self.max_workers = max_workers
24
+
25
+ # Initialize search engines
26
+ self.duckduckgo_engine = DuckDuckGoEngine()
27
+ self.cooking_engine = CookingSearchEngine()
28
+ self.multilingual_engine = MultilingualCookingEngine()
29
+ self.video_engine = VideoSearchEngine()
30
+
31
+ # Initialize processors
32
+ self.content_extractor = ContentExtractor()
33
+ self.cooking_processor = CookingSearchProcessor()
34
+ self.language_processor = LanguageProcessor()
35
+ self.source_aggregator = SourceAggregator()
36
+ self.enhanced_processor = EnhancedContentProcessor()
37
+ self.reranker = None # No complex reranking needed for cooking content
38
+
39
+ # Search strategies
40
+ self.strategies = [
41
+ self._search_multilingual,
42
+ self._search_duckduckgo,
43
+ self._search_cooking_sources
44
+ ]
45
+
46
+ def search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
47
+ """Execute comprehensive multilingual search with multiple strategies"""
48
+ logger.info(f"Starting comprehensive multilingual search for: {query}")
49
+
50
+ # Detect and enhance query for multiple languages
51
+ enhanced_queries = self.language_processor.enhance_query(query, target_language)
52
+ logger.info(f"Enhanced queries: {list(enhanced_queries.keys())}")
53
+
54
+ # Execute search strategies in parallel
55
+ all_results = []
56
+
57
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
58
+ # Submit search tasks for each language
59
+ future_to_strategy = {}
60
+
61
+ for lang, enhanced_query in enhanced_queries.items():
62
+ for strategy in self.strategies:
63
+ future = executor.submit(strategy, enhanced_query, num_results // len(enhanced_queries), lang)
64
+ future_to_strategy[future] = f"{strategy.__name__}_{lang}"
65
+
66
+ # Collect results
67
+ for future in as_completed(future_to_strategy):
68
+ strategy_name = future_to_strategy[future]
69
+ try:
70
+ results = future.result()
71
+ if results:
72
+ all_results.extend(results)
73
+ logger.info(f"{strategy_name} found {len(results)} results")
74
+ except Exception as e:
75
+ logger.error(f"{strategy_name} failed: {e}")
76
+
77
+ # Remove duplicates and filter by language preference
78
+ unique_results = self._remove_duplicates(all_results)
79
+ if target_language:
80
+ unique_results = self.language_processor.filter_by_language(unique_results, target_language)
81
+
82
+ logger.info(f"Total unique results: {len(unique_results)}")
83
+
84
+ # Extract content from URLs
85
+ enriched_results = self._enrich_with_content(unique_results)
86
+
87
+ # Simple cooking relevance filtering
88
+ if enriched_results:
89
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
90
+ relevant_results = []
91
+ for result in enriched_results:
92
+ title = result.get('title', '').lower()
93
+ content = result.get('content', '').lower()
94
+ if any(keyword in title or keyword in content for keyword in cooking_keywords):
95
+ relevant_results.append(result)
96
+
97
+ if relevant_results:
98
+ enriched_results = relevant_results
99
+ logger.info(f"Filtered to {len(enriched_results)} cooking-relevant results")
100
+
101
+ # Process results into comprehensive summary
102
+ summary, url_mapping = self.cooking_processor.process_results(enriched_results, query)
103
+
104
+ logger.info(f"Multilingual search completed: {len(url_mapping)} sources processed")
105
+ return summary, url_mapping
106
+
107
+ def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
109
+ """Search using the multilingual cooking engine"""
109
+ try:
110
+ if language:
111
+ results = self.multilingual_engine.search_by_language(query, language, num_results)
112
+ else:
113
+ results = self.multilingual_engine.search(query, num_results)
114
+ return results
115
+ except Exception as e:
116
+ logger.error(f"Multilingual search failed: {e}")
117
+ return []
118
+
119
+ def _search_duckduckgo(self, query: str, num_results: int, language: str = None) -> List[Dict]:
120
+ """Search using DuckDuckGo engine"""
121
+ try:
122
+ results = self.duckduckgo_engine.search(query, num_results)
123
+ return results
124
+ except Exception as e:
125
+ logger.error(f"DuckDuckGo search failed: {e}")
126
+ return []
127
+
128
+ def _search_cooking_sources(self, query: str, num_results: int, language: str = None) -> List[Dict]:
129
+ """Search using cooking sources engine"""
130
+ try:
131
+ results = self.cooking_engine.search(query, num_results)
132
+ return results
133
+ except Exception as e:
134
+ logger.error(f"Cooking sources search failed: {e}")
135
+ return []
136
+
137
+ def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
138
+ """Remove duplicate results based on URL"""
139
+ seen_urls = set()
140
+ unique_results = []
141
+
142
+ for result in results:
143
+ url = result.get('url', '')
144
+ if url and url not in seen_urls:
145
+ seen_urls.add(url)
146
+ unique_results.append(result)
147
+
148
+ return unique_results
149
+
150
+ def _enrich_with_content(self, results: List[Dict]) -> List[Dict]:
151
+ """Enrich results with extracted content"""
152
+ enriched_results = []
153
+
154
+ # Extract content in parallel
155
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
156
+ # Submit content extraction tasks
157
+ future_to_result = {
158
+ executor.submit(self.content_extractor.extract, result['url']): result
159
+ for result in results
160
+ }
161
+
162
+ # Collect enriched results
163
+ for future in as_completed(future_to_result):
164
+ original_result = future_to_result[future]
165
+ try:
166
+ content = future.result()
167
+ if content:
168
+ enriched_result = original_result.copy()
169
+ enriched_result['content'] = content
170
+ enriched_results.append(enriched_result)
171
+ except Exception as e:
172
+ logger.warning(f"Content extraction failed for {original_result['url']}: {e}")
173
+ # Still include result without content
174
+ enriched_results.append(original_result)
175
+
176
+ return enriched_results
177
+
178
+ def quick_search(self, query: str, num_results: int = 5) -> List[Dict]:
179
+ """Quick search for basic results without content extraction"""
180
+ logger.info(f"Quick search for: {query}")
181
+
182
+ # Use only DuckDuckGo for speed
183
+ results = self.duckduckgo_engine.search(query, num_results)
184
+
185
+ # If no results, try with simplified query
186
+ if not results:
187
+ logger.warning("No results from DuckDuckGo, trying simplified query")
188
+ simplified_query = self._simplify_query(query)
189
+ if simplified_query != query:
190
+ results = self.duckduckgo_engine.search(simplified_query, num_results)
191
+ logger.info(f"Simplified query '{simplified_query}' found {len(results)} results")
192
+
193
+ # If still no results, try cooking engine as fallback
194
+ if not results:
195
+ logger.warning("Still no results, trying cooking engine fallback")
196
+ try:
197
+ cooking_results = self.cooking_engine.search(query, num_results)
198
+ if cooking_results:
199
+ results = cooking_results
200
+ logger.info(f"Cooking engine fallback found {len(results)} results")
201
+ except Exception as e:
202
+ logger.warning(f"Cooking engine fallback failed: {e}")
203
+
204
+ # Remove duplicates
205
+ unique_results = self._remove_duplicates(results)
206
+
207
+ # If we still have no results, create a basic fallback
208
+ if not unique_results:
209
+ logger.warning("No search results found, creating basic fallback")
210
+ unique_results = self._create_fallback_results(query)
211
+
212
+ logger.info(f"Quick search completed: {len(unique_results)} results")
213
+ return unique_results
214
+
215
+ def _simplify_query(self, query: str) -> str:
216
+ """Simplify query to core cooking terms"""
217
+ if not query:
218
+ return ""
219
+
220
+ # Extract key cooking terms
221
+ import re
222
+ words = query.split()
223
+
224
+ # Keep cooking keywords and important terms
225
+ cooking_keywords = [
226
+ 'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
227
+ 'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
228
+ 'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
229
+ 'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
230
+ 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
231
+ 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
232
+ 'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai'
233
+ ]
234
+
235
+ # Keep words that are cooking keywords or are important (longer than 3 chars)
236
+ important_words = []
237
+ for word in words:
238
+ word_lower = word.lower()
239
+ if word_lower in cooking_keywords or len(word) > 3:
240
+ important_words.append(word)
241
+
242
+ # If we have important words, use them; otherwise use first few words
243
+ if important_words:
244
+ return ' '.join(important_words[:5]) # Max 5 words
245
+ else:
246
+ return ' '.join(words[:3]) # Max 3 words
247
+
248
+ def _create_fallback_results(self, query: str) -> List[Dict]:
249
+ """Create basic fallback results when search fails"""
250
+ # Create some basic cooking information URLs as fallback
251
+ fallback_urls = [
252
+ "https://www.allrecipes.com",
253
+ "https://www.foodnetwork.com",
254
+ "https://www.epicurious.com",
255
+ "https://www.seriouseats.com",
256
+ "https://www.bonappetit.com"
257
+ ]
258
+
259
+ results = []
260
+ for i, url in enumerate(fallback_urls[:3]): # Limit to 3 fallback results
261
+ results.append({
262
+ 'url': url,
263
+ 'title': f"Cooking Information - {query}",
264
+ 'source': 'fallback',
265
+ 'composite_score': 0.3 - (i * 0.05) # Decreasing score
266
+ })
267
+
268
+ return results
269
+
270
+ def cooking_focus_search(self, query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
271
+ """Cooking-focused search with enhanced processing"""
272
+ logger.info(f"Cooking focus search for: {query}")
273
+
274
+ # Use cooking engine primarily
275
+ cooking_results = self.cooking_engine.search(query, num_results)
276
+
277
+ # Add some general results for context
278
+ general_results = self.duckduckgo_engine.search(query, 3)
279
+
280
+ # Combine and deduplicate
281
+ all_results = self._remove_duplicates(cooking_results + general_results)
282
+
283
+ # Enrich with content
284
+ enriched_results = self._enrich_with_content(all_results)
285
+
286
+ # Simple cooking relevance filtering
287
+ if enriched_results:
288
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
289
+ relevant_results = []
290
+ for result in enriched_results:
291
+ title = result.get('title', '').lower()
292
+ content = result.get('content', '').lower()
293
+ if any(keyword in title or keyword in content for keyword in cooking_keywords):
294
+ relevant_results.append(result)
295
+
296
+ if relevant_results:
297
+ enriched_results = relevant_results
298
+ logger.info(f"Filtered to {len(enriched_results)} cooking-relevant results")
299
+
300
+ # Process with cooking focus
301
+ summary, url_mapping = self.cooking_processor.process_results(enriched_results, query)
302
+
303
+ logger.info(f"Cooking focus search completed: {len(url_mapping)} sources")
304
+ return summary, url_mapping
305
+
306
+ def multilingual_cooking_search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
307
+ """Comprehensive multilingual cooking search"""
308
+ logger.info(f"Multilingual cooking search for: {query} (target: {target_language})")
309
+
310
+ # Detect source language
311
+ source_language = self.language_processor.detect_language(query)
312
+ logger.info(f"Detected source language: {source_language}")
313
+
314
+ # Use multilingual search with language preference
315
+ summary, url_mapping = self.search(query, num_results, target_language)
316
+
317
+ logger.info(f"Multilingual cooking search completed: {len(url_mapping)} sources")
318
+ return summary, url_mapping
319
+
320
+ def comprehensive_search(self, query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
321
+ """Comprehensive search with maximum information extraction and detailed references"""
322
+ logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
323
+
324
+ # Detect source language
325
+ source_language = self.language_processor.detect_language(query)
326
+ logger.info(f"Detected source language: {source_language}")
327
+
328
+ # Execute comprehensive search
329
+ search_results = []
330
+ video_results = []
331
+
332
+ # 1. Multilingual text search
333
+ text_summary, text_url_mapping = self.search(query, num_results, target_language)
334
+
335
+ # 2. Video search if requested
336
+ if include_videos:
337
+ try:
338
+ video_results = self.video_search(query, num_results=5, target_language=target_language)
339
+ logger.info(f"Video search found {len(video_results)} videos")
340
+ except Exception as e:
341
+ logger.warning(f"Video search failed: {e}")
342
+
343
+ # 3. Aggregate all sources
344
+ all_sources = []
345
+
346
+ # Add text sources
347
+ for i, url in text_url_mapping.items():
348
+ # Find corresponding source data
349
+ source_data = self._find_source_data(url, text_url_mapping)
350
+ if source_data:
351
+ all_sources.append(source_data)
352
+
353
+ # Add video sources
354
+ for video in video_results:
355
+ all_sources.append(video)
356
+
357
+ # 4. Process with enhanced content processor
358
+ if all_sources:
359
+ comprehensive_summary, detailed_mapping = self.enhanced_processor.process_comprehensive_content(all_sources, query)
360
+ else:
361
+ comprehensive_summary = text_summary
362
+ detailed_mapping = text_url_mapping
363
+
364
+ # 5. Create comprehensive source aggregation
365
+ source_aggregation = self.source_aggregator.aggregate_sources(all_sources, video_results)
366
+
367
+ # 6. Generate comprehensive references
368
+ comprehensive_references = self.source_aggregator.create_comprehensive_references(all_sources, max_references=20)
369
+
370
+ # 7. Add inline citations
371
+ final_summary = self.enhanced_processor.create_inline_citations(comprehensive_summary, detailed_mapping)
372
+
373
+ # 8. Add source statistics
374
+ source_stats = self.enhanced_processor.generate_source_statistics(all_sources)
375
+
376
+ # 9. Combine everything
377
+ final_response = f"{final_summary}\n\n{comprehensive_references}\n\n{source_stats}"
378
+
379
+ logger.info(f"Comprehensive search completed: {len(all_sources)} total sources processed")
380
+
381
+ return final_response, detailed_mapping, source_aggregation
382
+
383
+ def _find_source_data(self, url: str, url_mapping: Dict[int, str]) -> Dict:
384
+ """Find source data for a given URL"""
385
+ # This is a simplified version - ensure required fields always exist
386
+ return {
387
+ 'url': url,
388
+ 'title': f"Source: {url}",
389
+ 'content': '',
390
+ 'domain': self._extract_domain(url),
391
+ 'type': 'text',
392
+ 'source_type': 'text',
393
+ 'language': 'en',
394
+ 'source_name': '',
395
+ 'platform': ''
396
+ }
397
+
398
+ def _extract_domain(self, url: str) -> str:
399
+ """Extract domain from URL"""
400
+ try:
401
+ from urllib.parse import urlparse
402
+ parsed = urlparse(url)
403
+ domain = parsed.netloc.lower()
404
+ if domain.startswith('www.'):
405
+ domain = domain[4:]
406
+ return domain
407
+ except:
408
+ return ''
409
+
410
+ def video_search(self, query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
411
+ """Search for cooking videos across multiple platforms"""
412
+ logger.info(f"Video search for: {query} (target: {target_language})")
413
+
414
+ # Detect language if not provided
415
+ if not target_language:
416
+ target_language = self.language_processor.detect_language(query)
417
+
418
+ # Map language codes
419
+ lang_mapping = {
420
+ 'EN': 'en',
421
+ 'VI': 'vi',
422
+ 'ZH': 'zh',
423
+ 'en': 'en',
424
+ 'vi': 'vi',
425
+ 'zh': 'zh'
426
+ }
427
+ search_language = lang_mapping.get(target_language, 'en')
428
+
429
+ # Search for videos
430
+ raw_results = self.video_engine.search(query, num_results, search_language)
431
+
432
+ # Simple video relevance filtering
433
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
434
+ filtered_video_results = []
435
+ for result in raw_results:
436
+ title = result.get('title', '').lower()
437
+ if any(keyword in title for keyword in cooking_keywords):
438
+ filtered_video_results.append(result)
439
+
440
+ # Validate and normalize results to avoid corrupted cards/links
441
+ video_results = self._sanitize_video_results(filtered_video_results, limit=num_results)
442
+
443
+ logger.info(f"Video search completed: {len(video_results)} videos found")
444
+ return video_results
445
+
446
+ def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
447
+ """Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
448
+ Drop unreachable/broken items and deduplicate by URL.
449
+ """
450
+ from urllib.parse import urlparse
451
+ import requests
452
+ clean: List[Dict] = []
453
+ seen = set()
454
+ for item in results or []:
455
+ url = (item or {}).get('url', '')
456
+ title = (item or {}).get('title', '').strip()
457
+ if not url or not title:
458
+ continue
459
+ try:
460
+ parsed = urlparse(url)
461
+ if parsed.scheme not in ('http', 'https'):
462
+ continue
463
+ if not parsed.netloc:
464
+ continue
465
+ # Quick reachability check; YouTube often blocks HEAD, so skip strict checks for youtube domain
466
+ host = parsed.netloc.lower()
467
+ norm_url = url
468
+ if 'youtube.com' not in host:
469
+ try:
470
+ r = requests.head(url, allow_redirects=True, timeout=3)
471
+ if r.status_code >= 400:
472
+ continue
473
+ norm_url = getattr(r, 'url', url) or url
474
+ except Exception:
475
+ # If HEAD blocked, try a light GET with small timeout
476
+ try:
477
+ r = requests.get(url, stream=True, timeout=4)
478
+ if r.status_code >= 400:
479
+ continue
480
+ norm_url = getattr(r, 'url', url) or url
481
+ except Exception:
482
+ continue
483
+ if norm_url in seen:
484
+ continue
485
+ seen.add(norm_url)
486
+ platform = parsed.netloc.lower()
487
+ if platform.startswith('www.'):
488
+ platform = platform[4:]
489
+ clean.append({
490
+ 'title': title,
491
+ 'url': norm_url,
492
+ 'thumbnail': item.get('thumbnail', ''),
493
+ 'source': item.get('source', platform.split('.')[0]),
494
+ 'platform': platform,
495
+ 'language': item.get('language', 'en')
496
+ })
497
+ if len(clean) >= limit:
498
+ break
499
+ except Exception:
500
+ continue
501
+ return clean
502
+
503
+
504
+
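A hypothetical usage sketch for the coordinator (not part of the diff; the calls below perform live web requests and the queries are placeholders):

from search.coordinator import SearchCoordinator

coordinator = SearchCoordinator(max_workers=3)
summary, url_mapping = coordinator.search("how to roast a whole chicken", num_results=10)
quick_hits = coordinator.quick_search("sourdough starter feeding schedule", num_results=5)
videos = coordinator.video_search("how to fold dumplings", num_results=3, target_language="en")
full_text, mapping, aggregation = coordinator.comprehensive_search("vegan ramen broth", include_videos=True)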
search/engines/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .duckduckgo import DuckDuckGoEngine
2
+ from .medical import MedicalSearchEngine
3
+ from .multilingual import MultilingualMedicalEngine
4
+ from .video import VideoSearchEngine
5
+
6
+ __all__ = ['DuckDuckGoEngine', 'MedicalSearchEngine', 'MultilingualMedicalEngine', 'VideoSearchEngine']
search/engines/cooking.py ADDED
@@ -0,0 +1,197 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict
5
+ import time
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class CookingSearchEngine:
10
+ """Specialized cooking search engine with curated sources"""
11
+
12
+ def __init__(self, timeout: int = 15):
13
+ self.session = requests.Session()
14
+ self.session.headers.update({
15
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
16
+ })
17
+ self.timeout = timeout
18
+
19
+ # Curated cooking sources
20
+ self.cooking_sources = {
21
+ 'allrecipes': {
22
+ 'base_url': 'https://www.allrecipes.com',
23
+ 'search_url': 'https://www.allrecipes.com/search',
24
+ 'domains': ['allrecipes.com']
25
+ },
26
+ 'food_network': {
27
+ 'base_url': 'https://www.foodnetwork.com',
28
+ 'search_url': 'https://www.foodnetwork.com/search',
29
+ 'domains': ['foodnetwork.com']
30
+ },
31
+ 'epicurious': {
32
+ 'base_url': 'https://www.epicurious.com',
33
+ 'search_url': 'https://www.epicurious.com/search',
34
+ 'domains': ['epicurious.com']
35
+ },
36
+ 'serious_eats': {
37
+ 'base_url': 'https://www.seriouseats.com',
38
+ 'search_url': 'https://www.seriouseats.com/search',
39
+ 'domains': ['seriouseats.com']
40
+ },
41
+ 'bon_appetit': {
42
+ 'base_url': 'https://www.bonappetit.com',
43
+ 'search_url': 'https://www.bonappetit.com/search',
44
+ 'domains': ['bonappetit.com']
45
+ }
46
+ }
47
+
48
+ def search(self, query: str, num_results: int = 10) -> List[Dict]:
49
+ """Search cooking sources for relevant information"""
50
+ results = []
51
+
52
+ # Strategy 1: Direct cooking source searches
53
+ for source_name, source_config in self.cooking_sources.items():
54
+ if len(results) >= num_results:
55
+ break
56
+
57
+ source_results = self._search_cooking_source(query, source_name, source_config)
58
+ results.extend(source_results)
59
+
60
+ # Add delay between requests
61
+ time.sleep(0.5)
62
+
63
+ # Strategy 2: Cooking fallback sources
64
+ if len(results) < num_results:
65
+ fallback_results = self._get_fallback_sources(query, num_results - len(results))
66
+ results.extend(fallback_results)
67
+
68
+ return results[:num_results]
69
+
70
+ def _search_cooking_source(self, query: str, source_name: str, source_config: Dict) -> List[Dict]:
71
+ """Search a specific cooking source"""
72
+ try:
73
+ search_url = source_config.get('search_url')
74
+ if not search_url:
75
+ return []
76
+
77
+ params = {
78
+ 'q': query,
79
+ 'query': query,
80
+ 'search': query
81
+ }
82
+
83
+ response = self.session.get(search_url, params=params, timeout=self.timeout)
84
+ response.raise_for_status()
85
+
86
+ soup = BeautifulSoup(response.content, 'html.parser')
87
+ results = []
88
+
89
+ # Source-specific selectors
90
+ selectors = self._get_source_selectors(source_name)
91
+
92
+ for selector in selectors:
93
+ links = soup.select(selector)
94
+ if links:
95
+ logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
96
+ break
97
+
98
+ for link in links[:3]: # Limit per source
99
+ try:
100
+ href = link.get('href')
101
+ if not href:
102
+ continue
103
+
104
+ # Make absolute URL
105
+ if href.startswith('/'):
106
+ href = source_config['base_url'] + href
107
+
108
+ title = link.get_text(strip=True)
109
+ if title and href.startswith('http'):
110
+ results.append({
111
+ 'url': href,
112
+ 'title': title,
113
+ 'source': source_name,
114
+ 'domain': source_config['domains'][0]
115
+ })
116
+ except Exception as e:
117
+ logger.debug(f"Error parsing {source_name} link: {e}")
118
+ continue
119
+
120
+ return results
121
+
122
+ except Exception as e:
123
+ logger.warning(f"Cooking source {source_name} search failed: {e}")
124
+ return []
125
+
126
+ def _get_source_selectors(self, source_name: str) -> List[str]:
127
+ """Get CSS selectors for specific cooking sources"""
128
+ selectors_map = {
129
+ 'allrecipes': [
130
+ 'a[href*="/recipe/"]',
131
+ 'a[href*="/recipes/"]',
132
+ '.search-result a',
133
+ '.result-title a'
134
+ ],
135
+ 'food_network': [
136
+ 'a[href*="/recipes/"]',
137
+ '.search-result a',
138
+ '.result-title a',
139
+ 'a[href*="/recipe/"]'
140
+ ],
141
+ 'epicurious': [
142
+ 'a[href*="/recipes/"]',
143
+ '.search-result a',
144
+ '.result-title a',
145
+ 'a[href*="/recipe/"]'
146
+ ],
147
+ 'serious_eats': [
148
+ 'a[href*="/recipes/"]',
149
+ '.search-result a',
150
+ '.result-title a',
151
+ 'a[href*="/recipe/"]'
152
+ ],
153
+ 'bon_appetit': [
154
+ 'a[href*="/recipes/"]',
155
+ '.search-result a',
156
+ '.result-title a',
157
+ 'a[href*="/recipe/"]'
158
+ ]
159
+ }
160
+ return selectors_map.get(source_name, ['a[href*="http"]'])
161
+
162
+ def _get_fallback_sources(self, query: str, num_results: int) -> List[Dict]:
163
+ """Get fallback cooking sources when direct search fails"""
164
+ fallback_sources = [
165
+ {
166
+ 'url': 'https://www.allrecipes.com/recipes',
167
+ 'title': f'AllRecipes: {query}',
168
+ 'source': 'allrecipes_fallback',
169
+ 'domain': 'allrecipes.com'
170
+ },
171
+ {
172
+ 'url': 'https://www.foodnetwork.com/recipes',
173
+ 'title': f'Food Network: {query}',
174
+ 'source': 'foodnetwork_fallback',
175
+ 'domain': 'foodnetwork.com'
176
+ },
177
+ {
178
+ 'url': 'https://www.epicurious.com/recipes-menus',
179
+ 'title': f'Epicurious: {query}',
180
+ 'source': 'epicurious_fallback',
181
+ 'domain': 'epicurious.com'
182
+ },
183
+ {
184
+ 'url': 'https://www.seriouseats.com/recipes',
185
+ 'title': f'Serious Eats: {query}',
186
+ 'source': 'seriouseats_fallback',
187
+ 'domain': 'seriouseats.com'
188
+ },
189
+ {
190
+ 'url': 'https://www.bonappetit.com/recipes',
191
+ 'title': f'Bon Appétit: {query}',
192
+ 'source': 'bonappetit_fallback',
193
+ 'domain': 'bonappetit.com'
194
+ }
195
+ ]
196
+
197
+ return fallback_sources[:num_results]
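A hypothetical usage sketch for the engine above (not part of the diff; it scrapes the curated recipe sites listed in cooking_sources, so output depends on those sites being reachable):

from search.engines.cooking import CookingSearchEngine

engine = CookingSearchEngine(timeout=15)
for hit in engine.search("beef pho broth", num_results=5):
    print(hit["source"], hit["title"], hit["url"])  # curated landing pages are returned as a fallback if direct search fails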
search/engines/duckduckgo.py ADDED
@@ -0,0 +1,599 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict
5
+ import time
6
+ from models.reranker import MedicalReranker
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class DuckDuckGoEngine:
11
+ """DuckDuckGo search engine with multiple strategies"""
12
+
13
+ def __init__(self, timeout: int = 15):
14
+ self.session = requests.Session()
15
+ self.session.headers.update({
16
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
17
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
18
+ 'Accept-Language': 'en-US,en;q=0.5',
19
+ 'Accept-Encoding': 'gzip, deflate',
20
+ 'Connection': 'keep-alive',
21
+ 'Upgrade-Insecure-Requests': '1',
22
+ })
23
+ self.timeout = timeout
24
+ self.reranker = MedicalReranker()
25
+
26
+ def search(self, query: str, num_results: int = 10) -> List[Dict]:
27
+ """Search with multiple DuckDuckGo strategies and medical focus"""
28
+ # Clean and simplify the query first
29
+ clean_query = self._clean_query(query)
30
+ logger.info(f"Cleaned query: '{query}' -> '{clean_query}'")
31
+
32
+ results = []
33
+ min_score = 0.15 # Reduced from 0.3 to be less strict
34
+
35
+ # Strategy 1: HTML Interface with medical focus
36
+ html_results = self._search_html(clean_query, num_results * 3) # Get more to filter
37
+ if html_results:
38
+ results.extend(html_results)
39
+ logger.info(f"DuckDuckGo HTML found {len(html_results)} results")
40
+
41
+ # Strategy 2: Instant Answer API
42
+ if len(results) < num_results * 2:
43
+ api_results = self._search_api(clean_query, num_results)
44
+ if api_results:
45
+ results.extend(api_results)
46
+ logger.info(f"DuckDuckGo API found {len(api_results)} results")
47
+
48
+ # Strategy 3: Lite Interface (mobile-friendly)
49
+ if len(results) < num_results * 2:
50
+ lite_results = self._search_lite(clean_query, num_results)
51
+ if lite_results:
52
+ results.extend(lite_results)
53
+ logger.info(f"DuckDuckGo Lite found {len(lite_results)} results")
54
+
55
+ # If still no results, try with even simpler query
56
+ if not results:
57
+ simple_query = self._simplify_query(clean_query)
58
+ if simple_query != clean_query:
59
+ logger.info(f"Trying simplified query: '{simple_query}'")
60
+ html_results = self._search_html(simple_query, num_results * 2)
61
+ if html_results:
62
+ results.extend(html_results)
63
+ logger.info(f"Simplified query found {len(html_results)} results")
64
+
65
+ # If still no results, try fallback search engines
66
+ if not results:
67
+ logger.warning("DuckDuckGo failed, trying fallback search engines")
68
+ fallback_results = self._fallback_search(clean_query, num_results)
69
+ if fallback_results:
70
+ results.extend(fallback_results)
71
+ logger.info(f"Fallback search found {len(fallback_results)} results")
72
+
73
+ # Filter out irrelevant results first (less aggressive)
74
+ filtered_results = self._filter_irrelevant_sources(results)
75
+ logger.info(f"Filtered {len(results)} results to {len(filtered_results)} relevant results")
76
+
77
+ # If we have results, use reranker; otherwise return what we have
78
+ if filtered_results:
79
+ try:
80
+ reranked_results = self.reranker.rerank_results(clean_query, filtered_results, min_score)
81
+ logger.info(f"Reranked {len(filtered_results)} results to {len(reranked_results)} high-quality results")
82
+
83
+ # If reranking filtered out too many results, be more lenient
84
+ if len(reranked_results) < min(3, num_results) and len(filtered_results) > 0:
85
+ logger.warning(f"Reranking too strict ({len(reranked_results)} results), using fallback with lower threshold")
86
+ # Try with even lower threshold
87
+ fallback_results = self.reranker.rerank_results(clean_query, filtered_results, 0.05)
88
+ if len(fallback_results) > len(reranked_results):
89
+ return fallback_results[:num_results]
90
+ else:
91
+ # Last resort: return original filtered results with basic scoring
92
+ for i, result in enumerate(filtered_results[:num_results]):
93
+ result['composite_score'] = 0.5 - (i * 0.05) # Decreasing score
94
+ return filtered_results[:num_results]
95
+
96
+ return reranked_results[:num_results]
97
+ except Exception as e:
98
+ logger.warning(f"Reranking failed: {e}, returning filtered results")
99
+ return filtered_results[:num_results]
100
+
101
+ return filtered_results[:num_results]
102
+
103
+ def _clean_query(self, query: str) -> str:
104
+ """Clean and normalize search query"""
105
+ if not query:
106
+ return ""
107
+
108
+ # Remove bullet points and special characters
109
+ import re
110
+ cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query) # Remove bullet points
111
+ cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned) # Keep only alphanumeric, spaces, hyphens, dots
112
+ cleaned = re.sub(r'\s+', ' ', cleaned) # Normalize whitespace
113
+ cleaned = cleaned.strip()
114
+
115
+ # Remove common prefixes that might confuse search
116
+ prefixes_to_remove = [
117
+ r'^(en|vi|zh)\s*:\s*',
118
+ r'^(search|find|look for)\s+',
119
+ r'^(how to|what is|what are)\s+',
120
+ ]
121
+
122
+ for prefix in prefixes_to_remove:
123
+ cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)
124
+
125
+ return cleaned.strip()
126
+
127
+ def _simplify_query(self, query: str) -> str:
128
+ """Simplify query to core medical terms"""
129
+ if not query:
130
+ return ""
131
+
132
+ # Extract key medical terms
133
+ import re
134
+ words = query.split()
135
+
136
+ # Keep medical keywords and important terms
137
+ medical_keywords = [
138
+ 'migraine', 'headache', 'pain', 'treatment', 'therapy', 'medication', 'drug',
139
+ 'chronic', 'acute', 'symptoms', 'diagnosis', 'prevention', 'management',
140
+ 'disease', 'condition', 'syndrome', 'disorder', 'infection', 'inflammation',
141
+ 'blood', 'heart', 'lung', 'brain', 'liver', 'kidney', 'diabetes', 'cancer',
142
+ 'covid', 'flu', 'cold', 'fever', 'cough', 'breathing', 'chest', 'stomach'
143
+ ]
144
+
145
+ # Keep words that are medical keywords or are important (longer than 3 chars)
146
+ important_words = []
147
+ for word in words:
148
+ word_lower = word.lower()
149
+ if word_lower in medical_keywords or len(word) > 3:
150
+ important_words.append(word)
151
+
152
+ # If we have important words, use them; otherwise use first few words
153
+ if important_words:
154
+ return ' '.join(important_words[:5]) # Max 5 words
155
+ else:
156
+ return ' '.join(words[:3]) # Max 3 words
157
+
158
+ def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
159
+ """Filter out irrelevant sources like generic health pages, quizzes, etc."""
160
+ import re
161
+ filtered = []
162
+
163
+ # Only exclude obvious non-medical content
164
+ exclude_patterns = [
165
+ r'/quiz$', # Quiz pages (end of URL)
166
+ r'/test$', # Test pages (end of URL)
167
+ r'/assessment', # Assessment pages
168
+ r'/survey', # Survey pages
169
+ r'homepage|main page|index', # Homepage/index pages
170
+ r'login|sign.up|register', # Auth pages
171
+ r'contact|about.us|privacy', # Info pages
172
+ r'subscribe|newsletter|rss', # Subscription pages
173
+ r'sitemap', # Navigation pages
174
+ ]
175
+
176
+ for result in results:
177
+ url = result.get('url', '').lower()
178
+ title = result.get('title', '').lower()
179
+
180
+ # Skip if matches exclude patterns
181
+ should_exclude = False
182
+ for pattern in exclude_patterns:
183
+ if re.search(pattern, url) or re.search(pattern, title):
184
+ should_exclude = True
185
+ logger.debug(f"Excluding irrelevant source: {url}")
186
+ break
187
+
188
+ if not should_exclude:
189
+ filtered.append(result)
190
+
191
+ # If we filtered out too many, be less aggressive
192
+ if len(filtered) < len(results) * 0.3: # If we kept less than 30%
193
+ logger.warning(f"Filtering too aggressive, keeping more results: {len(results)} -> {len(filtered)}")
194
+ # Return original results with minimal filtering
195
+ minimal_filtered = []
196
+ for result in results:
197
+ url = result.get('url', '').lower()
198
+ if not any(re.search(pattern, url) for pattern in [r'login', r'sign.up', r'register']):
199
+ minimal_filtered.append(result)
200
+ return minimal_filtered
201
+
202
+ return filtered
203
+
204
+ def _search_html(self, query: str, num_results: int) -> List[Dict]:
205
+ """Search using DuckDuckGo HTML interface with better error handling"""
206
+ try:
207
+ # Try multiple DuckDuckGo endpoints
208
+ endpoints = [
209
+ {
210
+ 'url': 'https://html.duckduckgo.com/html/',
211
+ 'params': {
212
+ 'q': query,
213
+ 'kl': 'us-en',
214
+ 's': '0',
215
+ 'dc': '1',
216
+ 'v': 'l'
217
+ }
218
+ },
219
+ {
220
+ 'url': 'https://lite.duckduckgo.com/lite/',
221
+ 'params': {
222
+ 'q': query,
223
+ 'kl': 'us-en'
224
+ }
225
+ },
226
+ {
227
+ 'url': 'https://duckduckgo.com/html/',
228
+ 'params': {
229
+ 'q': query,
230
+ 'kl': 'us-en'
231
+ }
232
+ }
233
+ ]
234
+
235
+ for endpoint in endpoints:
236
+ try:
237
+ # Add random delay to avoid rate limiting
238
+ import time
239
+ time.sleep(0.5)
240
+
241
+ # Update headers to look more like a real browser
242
+ headers = self.session.headers.copy()
243
+ headers.update({
244
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
245
+ 'Accept-Language': 'en-US,en;q=0.5',
246
+ 'Accept-Encoding': 'gzip, deflate',
247
+ 'DNT': '1',
248
+ 'Connection': 'keep-alive',
249
+ 'Upgrade-Insecure-Requests': '1',
250
+ })
251
+
252
+ response = self.session.get(
253
+ endpoint['url'],
254
+ params=endpoint['params'],
255
+ headers=headers,
256
+ timeout=self.timeout
257
+ )
258
+
259
+ if response.status_code == 403:
260
+ logger.warning(f"DuckDuckGo endpoint {endpoint['url']} returned 403, trying next...")
261
+ continue
262
+ elif response.status_code == 429:
263
+ logger.warning(f"DuckDuckGo rate limited, waiting...")
264
+ time.sleep(2)
265
+ continue
266
+ else:
+ break  # Usable response received; stop trying further endpoints so the for-else fallback only fires when every endpoint fails
+
267
+ except Exception as e:
268
+ logger.warning(f"DuckDuckGo endpoint {endpoint['url']} failed: {e}")
269
+ if endpoint == endpoints[-1]: # Last endpoint
270
+ raise e
271
+ continue
272
+ else:
273
+ # All endpoints failed
274
+ logger.error("All DuckDuckGo endpoints failed")
275
+ return []
276
+
277
+ soup = BeautifulSoup(response.content, 'html.parser')
278
+ results = []
279
+
280
+ # Multiple selectors for different DDG layouts
281
+ selectors = [
282
+ 'a.result__a',
283
+ 'a[data-testid="result-title-a"]',
284
+ '.result__title a',
285
+ '.web-result a',
286
+ '.result a',
287
+ 'a[href*="http"]:not([href*="duckduckgo.com"])'
288
+ ]
289
+
290
+ for selector in selectors:
291
+ links = soup.select(selector)
292
+ if links:
293
+ logger.info(f"Using selector: {selector} - found {len(links)} links")
294
+ break
295
+
296
+ for link in links[:num_results]:
297
+ try:
298
+ href = link.get('href')
299
+ if not href or href.startswith('#') or 'duckduckgo.com' in href:
300
+ continue
301
+
302
+ # Clean up DDG redirect URLs
303
+ if href.startswith('/l/?uddg='):
304
+ import urllib.parse
305
+ href = urllib.parse.unquote(href.split('uddg=')[1])
306
+
307
+ title = link.get_text(strip=True)
308
+ if title and href.startswith('http'):
309
+ results.append({
310
+ 'url': href,
311
+ 'title': title,
312
+ 'source': 'duckduckgo_html'
313
+ })
314
+ except Exception as e:
315
+ logger.debug(f"Error parsing link: {e}")
316
+ continue
317
+
318
+ return results
319
+
320
+ except Exception as e:
321
+ logger.warning(f"DuckDuckGo HTML search failed: {e}")
322
+ return []
323
+
324
+ def _search_api(self, query: str, num_results: int) -> List[Dict]:
325
+ """Search using DuckDuckGo Instant Answer API"""
326
+ try:
327
+ url = "https://api.duckduckgo.com/"
328
+ params = {
329
+ 'q': query,
330
+ 'format': 'json',
331
+ 'no_html': '1',
332
+ 'skip_disambig': '1',
333
+ 't': 'MedicalChatbot'
334
+ }
335
+
336
+ response = self.session.get(url, params=params, timeout=self.timeout)
337
+ response.raise_for_status()
338
+ data = response.json()
339
+
340
+ results = []
341
+
342
+ # Abstract result
343
+ if data.get('AbstractURL') and data.get('Abstract'):
344
+ results.append({
345
+ 'url': data['AbstractURL'],
346
+ 'title': data.get('Heading', query),
347
+ 'content': data.get('Abstract', ''),
348
+ 'source': 'duckduckgo_api'
349
+ })
350
+
351
+ # Related topics
352
+ for topic in data.get('RelatedTopics', []):
353
+ if len(results) >= num_results:
354
+ break
355
+
356
+ if isinstance(topic, dict) and topic.get('FirstURL'):
357
+ text = topic.get('Text', '')
358
+ title = text.split(' - ')[0] if ' - ' in text else text[:50]
359
+
360
+ results.append({
361
+ 'url': topic['FirstURL'],
362
+ 'title': title,
363
+ 'content': text,
364
+ 'source': 'duckduckgo_api'
365
+ })
366
+
367
+ return results
368
+
369
+ except Exception as e:
370
+ logger.warning(f"DuckDuckGo API search failed: {e}")
371
+ return []
372
+
373
+ def _search_lite(self, query: str, num_results: int) -> List[Dict]:
374
+ """Search using DuckDuckGo Lite interface"""
375
+ try:
376
+ url = "https://lite.duckduckgo.com/lite/"
377
+ params = {
378
+ 'q': query,
379
+ 'kl': 'us-en'
380
+ }
381
+
382
+ response = self.session.get(url, params=params, timeout=self.timeout)
383
+ response.raise_for_status()
384
+
385
+ soup = BeautifulSoup(response.content, 'html.parser')
386
+ results = []
387
+
388
+ # Lite interface selectors
389
+ links = soup.select('a[href*="http"]:not([href*="duckduckgo.com"])')
390
+
391
+ for link in links[:num_results]:
392
+ try:
393
+ href = link.get('href')
394
+ title = link.get_text(strip=True)
395
+
396
+ if href and title and href.startswith('http'):
397
+ results.append({
398
+ 'url': href,
399
+ 'title': title,
400
+ 'source': 'duckduckgo_lite'
401
+ })
402
+ except Exception as e:
403
+ logger.debug(f"Error parsing lite link: {e}")
404
+ continue
405
+
406
+ return results
407
+
408
+ except Exception as e:
409
+ logger.warning(f"DuckDuckGo Lite search failed: {e}")
410
+ return []
411
+
412
+ def _fallback_search(self, query: str, num_results: int) -> List[Dict]:
413
+ """Fallback search using alternative methods when DuckDuckGo fails"""
414
+ results = []
415
+
416
+ # Try Bing search as fallback
417
+ try:
418
+ bing_results = self._search_bing(query, num_results)
419
+ if bing_results:
420
+ results.extend(bing_results)
421
+ logger.info(f"Bing fallback found {len(bing_results)} results")
422
+ except Exception as e:
423
+ logger.warning(f"Bing fallback failed: {e}")
424
+
425
+ # Try Startpage search as fallback
426
+ try:
427
+ startpage_results = self._search_startpage(query, num_results)
428
+ if startpage_results:
429
+ results.extend(startpage_results)
430
+ logger.info(f"Startpage fallback found {len(startpage_results)} results")
431
+ except Exception as e:
432
+ logger.warning(f"Startpage fallback failed: {e}")
433
+
434
+ # Try Searx instances as fallback
435
+ try:
436
+ searx_results = self._search_searx(query, num_results)
437
+ if searx_results:
438
+ results.extend(searx_results)
439
+ logger.info(f"Searx fallback found {len(searx_results)} results")
440
+ except Exception as e:
441
+ logger.warning(f"Searx fallback failed: {e}")
442
+
443
+ return results
444
+
445
+ def _search_bing(self, query: str, num_results: int) -> List[Dict]:
446
+ """Search using Bing as fallback"""
447
+ try:
448
+ url = "https://www.bing.com/search"
449
+ params = {
450
+ 'q': query,
451
+ 'count': min(num_results, 50),
452
+ 'first': 1
453
+ }
454
+
455
+ headers = self.session.headers.copy()
456
+ headers.update({
457
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
458
+ 'Accept-Language': 'en-US,en;q=0.5',
459
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
460
+ })
461
+
462
+ response = self.session.get(url, params=params, headers=headers, timeout=self.timeout)
463
+ response.raise_for_status()
464
+
465
+ soup = BeautifulSoup(response.content, 'html.parser')
466
+ results = []
467
+
468
+ # Bing result selectors
469
+ selectors = [
470
+ 'h2 a',
471
+ '.b_title a',
472
+ '.b_algo a'
473
+ ]
474
+
475
+ for selector in selectors:
476
+ links = soup.select(selector)
477
+ if links:
478
+ logger.info(f"Bing found {len(links)} links with selector: {selector}")
479
+ break
480
+
481
+ for link in links[:num_results]:
482
+ try:
483
+ href = link.get('href')
484
+ if not href or href.startswith('#') or 'bing.com' in href:
485
+ continue
486
+
487
+ title = link.get_text(strip=True)
488
+ if title and href.startswith('http'):
489
+ results.append({
490
+ 'url': href,
491
+ 'title': title,
492
+ 'source': 'bing_fallback'
493
+ })
494
+ except Exception as e:
495
+ logger.debug(f"Error parsing Bing link: {e}")
496
+ continue
497
+
498
+ return results
499
+
500
+ except Exception as e:
501
+ logger.warning(f"Bing search failed: {e}")
502
+ return []
503
+
504
+ def _search_startpage(self, query: str, num_results: int) -> List[Dict]:
505
+ """Search using Startpage as fallback"""
506
+ try:
507
+ url = "https://www.startpage.com/sp/search"
508
+ params = {
509
+ 'query': query,
510
+ 'cat': 'web',
511
+ 'pl': 'opensearch'
512
+ }
513
+
514
+ headers = self.session.headers.copy()
515
+ headers.update({
516
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
517
+ 'Accept-Language': 'en-US,en;q=0.5'
518
+ })
519
+
520
+ response = self.session.get(url, params=params, headers=headers, timeout=self.timeout)
521
+ response.raise_for_status()
522
+
523
+ soup = BeautifulSoup(response.content, 'html.parser')
524
+ results = []
525
+
526
+ # Startpage result selectors
527
+ links = soup.select('a[href*="http"]:not([href*="startpage.com"])')
528
+
529
+ for link in links[:num_results]:
530
+ try:
531
+ href = link.get('href')
532
+ if not href or href.startswith('#') or 'startpage.com' in href:
533
+ continue
534
+
535
+ title = link.get_text(strip=True)
536
+ if title and href.startswith('http'):
537
+ results.append({
538
+ 'url': href,
539
+ 'title': title,
540
+ 'source': 'startpage_fallback'
541
+ })
542
+ except Exception as e:
543
+ logger.debug(f"Error parsing Startpage link: {e}")
544
+ continue
545
+
546
+ return results
547
+
548
+ except Exception as e:
549
+ logger.warning(f"Startpage search failed: {e}")
550
+ return []
551
+
552
+ def _search_searx(self, query: str, num_results: int) -> List[Dict]:
553
+ """Search using public Searx instances as fallback"""
554
+ searx_instances = [
555
+ "https://searx.be",
556
+ "https://searx.tiekoetter.com",
557
+ "https://searx.xyz"
558
+ ]
559
+
560
+ for instance in searx_instances:
561
+ try:
562
+ url = f"{instance}/search"
563
+ params = {
564
+ 'q': query,
565
+ 'format': 'json'
566
+ }
567
+
568
+ response = self.session.get(url, params=params, timeout=self.timeout)
569
+ response.raise_for_status()
570
+
571
+ data = response.json()
572
+ results = []
573
+
574
+ for result in data.get('results', [])[:num_results]:
575
+ try:
576
+ url = result.get('url', '')
577
+ title = result.get('title', '')
578
+ content = result.get('content', '')
579
+
580
+ if url and title and url.startswith('http'):
581
+ results.append({
582
+ 'url': url,
583
+ 'title': title,
584
+ 'content': content,
585
+ 'source': 'searx_fallback'
586
+ })
587
+ except Exception as e:
588
+ logger.debug(f"Error parsing Searx result: {e}")
589
+ continue
590
+
591
+ if results:
592
+ logger.info(f"Searx instance {instance} found {len(results)} results")
593
+ return results
594
+
595
+ except Exception as e:
596
+ logger.debug(f"Searx instance {instance} failed: {e}")
597
+ continue
598
+
599
+ return []
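The endpoint loop in `_search_html` above leans on Python's `for`/`else` semantics: the `else` branch runs only when the loop finishes without hitting `break`, which is why a usable response must break out while blocked or rate-limited responses `continue` to the next endpoint. A minimal, self-contained sketch of that pattern (the endpoints and function name here are illustrative, not part of the module):

```python
import requests

def fetch_first_working(query: str):
    """Try endpoints in order; the for/else `else` fires only if no `break` ran."""
    endpoints = [
        "https://html.duckduckgo.com/html/",
        "https://lite.duckduckgo.com/lite/",
    ]
    for url in endpoints:
        try:
            resp = requests.get(url, params={"q": query}, timeout=10)
        except requests.RequestException:
            continue                     # network error: try the next endpoint
        if resp.status_code in (403, 429):
            continue                     # blocked or rate limited: try the next endpoint
        break                            # usable response: stop trying endpoints
    else:
        return None                      # every endpoint failed
    return resp
```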
search/engines/multilingual.py ADDED
@@ -0,0 +1,272 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict, Optional
5
+ import time
6
+ import re
7
+ from urllib.parse import urlparse, quote
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class MultilingualCookingEngine:
12
+ """Multilingual cooking search engine supporting English, Vietnamese, and Chinese sources"""
13
+
14
+ def __init__(self, timeout: int = 15):
15
+ self.session = requests.Session()
16
+ self.session.headers.update({
17
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
18
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
19
+ 'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
20
+ 'Accept-Encoding': 'gzip, deflate',
21
+ 'Connection': 'keep-alive',
22
+ })
23
+ self.timeout = timeout
24
+
25
+ # Comprehensive cooking sources by language
26
+ self.cooking_sources = {
27
+ 'en': {
28
+ # Major Cooking Sources
29
+ 'allrecipes': {
30
+ 'base_url': 'https://www.allrecipes.com',
31
+ 'search_url': 'https://www.allrecipes.com/search',
32
+ 'domains': ['allrecipes.com'],
33
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/recipes/"]', '.search-result a']
34
+ },
35
+ 'food_network': {
36
+ 'base_url': 'https://www.foodnetwork.com',
37
+ 'search_url': 'https://www.foodnetwork.com/search',
38
+ 'domains': ['foodnetwork.com'],
39
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
40
+ },
41
+ 'epicurious': {
42
+ 'base_url': 'https://www.epicurious.com',
43
+ 'search_url': 'https://www.epicurious.com/search',
44
+ 'domains': ['epicurious.com'],
45
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
46
+ },
47
+ 'serious_eats': {
48
+ 'base_url': 'https://www.seriouseats.com',
49
+ 'search_url': 'https://www.seriouseats.com/search',
50
+ 'domains': ['seriouseats.com'],
51
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
52
+ },
53
+ 'bon_appetit': {
54
+ 'base_url': 'https://www.bonappetit.com',
55
+ 'search_url': 'https://www.bonappetit.com/search',
56
+ 'domains': ['bonappetit.com'],
57
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
58
+ },
59
+ 'taste_of_home': {
60
+ 'base_url': 'https://www.tasteofhome.com',
61
+ 'search_url': 'https://www.tasteofhome.com/search',
62
+ 'domains': ['tasteofhome.com'],
63
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
64
+ },
65
+ 'food_com': {
66
+ 'base_url': 'https://www.food.com',
67
+ 'search_url': 'https://www.food.com/search',
68
+ 'domains': ['food.com'],
69
+ 'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
70
+ }
71
+ },
72
+ 'vi': {
73
+ # Vietnamese Cooking Sources
74
+ 'mon_ngon_viet': {
75
+ 'base_url': 'https://monngonviet.com',
76
+ 'search_url': 'https://monngonviet.com/tim-kiem',
77
+ 'domains': ['monngonviet.com'],
78
+ 'selectors': ['a[href*="/cong-thuc/"]', 'a[href*="/mon-an/"]', '.search-result a']
79
+ },
80
+ 'day_phong_cach': {
81
+ 'base_url': 'https://dayphongcach.vn',
82
+ 'search_url': 'https://dayphongcach.vn/tim-kiem',
83
+ 'domains': ['dayphongcach.vn'],
84
+ 'selectors': ['a[href*="/mon-an/"]', 'a[href*="/cong-thuc/"]', '.search-result a']
85
+ },
86
+ 'am_thuc_viet': {
87
+ 'base_url': 'https://amthucviet.vn',
88
+ 'search_url': 'https://amthucviet.vn/tim-kiem',
89
+ 'domains': ['amthucviet.vn'],
90
+ 'selectors': ['a[href*="/mon-an/"]', 'a[href*="/cong-thuc/"]', '.search-result a']
91
+ }
92
+ },
93
+ 'zh': {
94
+ # Chinese Cooking Sources
95
+ 'xiachufang': {
96
+ 'base_url': 'https://www.xiachufang.com',
97
+ 'search_url': 'https://www.xiachufang.com/search',
98
+ 'domains': ['xiachufang.com'],
99
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
100
+ },
101
+ 'douguo': {
102
+ 'base_url': 'https://www.douguo.com',
103
+ 'search_url': 'https://www.douguo.com/search',
104
+ 'domains': ['douguo.com'],
105
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
106
+ },
107
+ 'meishij': {
108
+ 'base_url': 'https://www.meishij.net',
109
+ 'search_url': 'https://www.meishij.net/search',
110
+ 'domains': ['meishij.net'],
111
+ 'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
112
+ }
113
+ }
114
+ }
115
+
116
+ def search(self, query: str, num_results: int = 10, languages: List[str] = None) -> List[Dict]:
117
+ """Search across multiple languages and cooking sources"""
118
+ if languages is None:
119
+ languages = ['en', 'vi', 'zh']
120
+
121
+ all_results = []
122
+
123
+ for lang in languages:
124
+ if lang in self.cooking_sources:
125
+ lang_results = self._search_language_sources(query, lang, num_results // len(languages))
126
+ all_results.extend(lang_results)
127
+ time.sleep(0.5) # Rate limiting between languages
128
+
129
+ return all_results[:num_results]
130
+
131
+ def _search_language_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
132
+ """Search sources for a specific language"""
133
+ results = []
134
+ sources = self.cooking_sources.get(language, {})
135
+
136
+ for source_name, source_config in sources.items():
137
+ if len(results) >= num_results:
138
+ break
139
+
140
+ source_results = self._search_source(query, source_name, source_config, language)
141
+ results.extend(source_results)
142
+ time.sleep(0.3) # Rate limiting
143
+
144
+ return results
145
+
146
+ def _search_source(self, query: str, source_name: str, source_config: Dict, language: str) -> List[Dict]:
147
+ """Search a specific cooking source"""
148
+ try:
149
+ search_url = source_config.get('search_url')
150
+ if not search_url:
151
+ return []
152
+
153
+ params = {
154
+ 'q': query,
155
+ 'query': query,
156
+ 'search': query,
157
+ 'keyword': query
158
+ }
159
+
160
+ response = self.session.get(search_url, params=params, timeout=self.timeout)
161
+ response.raise_for_status()
162
+
163
+ soup = BeautifulSoup(response.content, 'html.parser')
164
+ results = []
165
+
166
+ # Source-specific selectors
167
+ selectors = source_config.get('selectors', ['a[href*="http"]'])
168
+
169
+ for selector in selectors:
170
+ links = soup.select(selector)
171
+ if links:
172
+ logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
173
+ break
174
+
175
+ for link in links[:3]: # Limit per source
176
+ try:
177
+ href = link.get('href')
178
+ if not href:
179
+ continue
180
+
181
+ # Make absolute URL
182
+ if href.startswith('/'):
183
+ href = source_config['base_url'] + href
184
+
185
+ title = link.get_text(strip=True)
186
+ if title and href.startswith('http'):
187
+ results.append({
188
+ 'url': href,
189
+ 'title': title,
190
+ 'source': source_name,
191
+ 'domain': source_config['domains'][0],
192
+ 'language': language
193
+ })
194
+ except Exception as e:
195
+ logger.debug(f"Error parsing {source_name} link: {e}")
196
+ continue
197
+
198
+ return results
199
+
200
+ except Exception as e:
201
+ logger.warning(f"Cooking source {source_name} ({language}) search failed: {e}")
202
+ return []
203
+
204
+ def search_by_language(self, query: str, language: str, num_results: int = 10) -> List[Dict]:
205
+ """Search sources for a specific language only"""
206
+ if language not in self.cooking_sources:
207
+ logger.warning(f"Language {language} not supported")
208
+ return []
209
+
210
+ return self._search_language_sources(query, language, num_results)
211
+
212
+ def _get_fallback_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
213
+ """Get fallback cooking sources when direct search fails"""
214
+ fallback_sources = {
215
+ 'en': [
216
+ {
217
+ 'url': 'https://www.allrecipes.com/recipes',
218
+ 'title': f'AllRecipes: {query}',
219
+ 'source': 'allrecipes_fallback',
220
+ 'language': 'en',
221
+ 'domain': 'allrecipes.com'
222
+ },
223
+ {
224
+ 'url': 'https://www.foodnetwork.com/recipes',
225
+ 'title': f'Food Network: {query}',
226
+ 'source': 'foodnetwork_fallback',
227
+ 'language': 'en',
228
+ 'domain': 'foodnetwork.com'
229
+ },
230
+ {
231
+ 'url': 'https://www.epicurious.com/recipes-menus',
232
+ 'title': f'Epicurious: {query}',
233
+ 'source': 'epicurious_fallback',
234
+ 'language': 'en',
235
+ 'domain': 'epicurious.com'
236
+ }
237
+ ],
238
+ 'vi': [
239
+ {
240
+ 'url': 'https://monngonviet.com/cong-thuc',
241
+ 'title': f'Món Ngon Việt: {query}',
242
+ 'source': 'monngonviet_fallback',
243
+ 'language': 'vi',
244
+ 'domain': 'monngonviet.com'
245
+ },
246
+ {
247
+ 'url': 'https://dayphongcach.vn/mon-an',
248
+ 'title': f'Dạy Phong Cách: {query}',
249
+ 'source': 'dayphongcach_fallback',
250
+ 'language': 'vi',
251
+ 'domain': 'dayphongcach.vn'
252
+ }
253
+ ],
254
+ 'zh': [
255
+ {
256
+ 'url': 'https://www.xiachufang.com/recipe',
257
+ 'title': f'下厨房: {query}',
258
+ 'source': 'xiachufang_fallback',
259
+ 'language': 'zh',
260
+ 'domain': 'xiachufang.com'
261
+ },
262
+ {
263
+ 'url': 'https://www.douguo.com/recipe',
264
+ 'title': f'豆果: {query}',
265
+ 'source': 'douguo_fallback',
266
+ 'language': 'zh',
267
+ 'domain': 'douguo.com'
268
+ }
269
+ ]
270
+ }
271
+
272
+ return fallback_sources.get(language, [])[:num_results]
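A short usage sketch for the engine above. The import path follows the file location in this commit, the query is arbitrary, and live results depend on the listed recipe sites being reachable:

```python
from search.engines.multilingual import MultilingualCookingEngine

engine = MultilingualCookingEngine(timeout=15)

# num_results is split evenly across the requested languages inside search(),
# and each source contributes at most three links.
results = engine.search("beef pho broth", num_results=6, languages=["en", "vi"])

for r in results:
    print(f"[{r['language']}] {r['title']} -> {r['url']} ({r['domain']})")
```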
search/engines/video.py ADDED
@@ -0,0 +1,432 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict, Optional
5
+ import time
6
+ import re
7
+ from urllib.parse import urlparse, quote
8
+ from models.reranker import MedicalReranker
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ class VideoSearchEngine:
13
+ """Search engine for medical videos across multiple platforms"""
14
+
15
+ def __init__(self, timeout: int = 15):
16
+ self.session = requests.Session()
17
+ self.session.headers.update({
18
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
19
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
20
+ 'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
21
+ 'Accept-Encoding': 'gzip, deflate',
22
+ 'Connection': 'keep-alive',
23
+ })
24
+ self.timeout = timeout
25
+ self.reranker = MedicalReranker()
26
+
27
+ # Video platforms by language
28
+ self.video_platforms = {
29
+ 'en': [
30
+ {
31
+ 'name': 'youtube',
32
+ 'search_url': 'https://www.youtube.com/results',
33
+ 'params': {'search_query': ''},
34
+ 'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
35
+ 'base_url': 'https://www.youtube.com'
36
+ },
37
+ {
38
+ 'name': 'medscape_videos',
39
+ 'search_url': 'https://www.medscape.com/search',
40
+ 'params': {'q': ''},
41
+ 'selectors': ['a[href*="/video/"]', 'a[href*="/viewarticle/"]'],
42
+ 'base_url': 'https://www.medscape.com'
43
+ }
44
+ ],
45
+ 'vi': [
46
+ {
47
+ 'name': 'youtube_vi',
48
+ 'search_url': 'https://www.youtube.com/results',
49
+ 'params': {'search_query': ''},
50
+ 'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
51
+ 'base_url': 'https://www.youtube.com'
52
+ },
53
+ {
54
+ 'name': 'vinmec_videos',
55
+ 'search_url': 'https://www.vinmec.com/vi/tim-kiem',
56
+ 'params': {'q': ''},
57
+ 'selectors': ['a[href*="/video/"]', 'a[href*="/suc-khoe/"]'],
58
+ 'base_url': 'https://www.vinmec.com'
59
+ }
60
+ ],
61
+ 'zh': [
62
+ {
63
+ 'name': 'youtube_zh',
64
+ 'search_url': 'https://www.youtube.com/results',
65
+ 'params': {'search_query': ''},
66
+ 'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
67
+ 'base_url': 'https://www.youtube.com'
68
+ },
69
+ {
70
+ 'name': 'haodf_videos',
71
+ 'search_url': 'https://www.haodf.com/search',
72
+ 'params': {'q': ''},
73
+ 'selectors': ['a[href*="/video/"]', 'a[href*="/jibing/"]'],
74
+ 'base_url': 'https://www.haodf.com'
75
+ }
76
+ ]
77
+ }
78
+
79
+ def _normalize_query(self, q: str) -> str:
80
+ if not q:
81
+ return ""
82
+ q = q.strip()
83
+ q = re.sub(r"^(en|vi|zh)\s*:\s*", "", q, flags=re.IGNORECASE)
84
+ # Remove bullet points and special characters
85
+ q = re.sub(r'[•·▪▫‣⁃]', ' ', q)
86
+ q = re.sub(r'[^\w\s\-\.]', ' ', q)
87
+ q = re.sub(r"\s+", " ", q)
88
+ return q.strip()
89
+
90
+ def _is_valid_medical_video(self, result: Dict, query: str) -> bool:
91
+ """Check if video is medically relevant and has valid URL"""
92
+ url = result.get('url', '')
93
+ title = result.get('title', '')
94
+
95
+ # Skip generic YouTube search result pages
96
+ if 'results?search_query=' in url:
97
+ return False
98
+
99
+ # Skip non-YouTube URLs that aren't medical platforms
100
+ if 'youtube.com' not in url and not any(med in url for med in ['medscape.com', 'vinmec.com', 'haodf.com']):
101
+ return False
102
+
103
+ # Check if title contains medical keywords or query terms
104
+ title_lower = title.lower()
105
+ query_lower = query.lower()
106
+
107
+ medical_keywords = [
108
+ 'medical', 'health', 'doctor', 'treatment', 'diagnosis',
109
+ 'symptoms', 'therapy', 'medicine', 'clinical', 'patient',
110
+ 'disease', 'condition', 'healthcare', 'physician'
111
+ ]
112
+
113
+ # Must contain medical keywords or query terms
114
+ has_medical = any(keyword in title_lower for keyword in medical_keywords)
115
+ has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
116
+
117
+ return has_medical or has_query
118
+
119
+ def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
120
+ """Search platform with retry logic and better error handling"""
121
+ for attempt in range(max_retries):
122
+ try:
123
+ return self._search_platform(query, platform, num_results)
124
+ except Exception as e:
125
+ logger.warning(f"Attempt {attempt + 1} failed for {platform['name']}: {e}")
126
+ if attempt < max_retries - 1:
127
+ time.sleep(1) # Wait before retry
128
+ else:
129
+ logger.error(f"All attempts failed for {platform['name']}")
130
+ return []
131
+
132
+ def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
133
+ """Search for medical videos across platforms with deduplication and medical filtering"""
134
+ query = self._normalize_query(query)
135
+ logger.info(f"Searching for medical videos: {query} (language: {language})")
136
+
137
+ results = []
138
+ seen_urls = set() # Track URLs to avoid duplicates
139
+ seen_video_ids = set() # Track video IDs to avoid duplicates
140
+ platforms = self.video_platforms.get(language, self.video_platforms['en'])
141
+
142
+ # Try platforms in order of reliability
143
+ for platform in platforms:
144
+ if len(results) >= num_results:
145
+ break
146
+
147
+ try:
148
+ # Add timeout and retry logic
149
+ platform_results = self._search_platform_with_retry(query, platform, num_results * 3)
150
+
151
+ if not platform_results:
152
+ logger.warning(f"No results from {platform['name']}")
153
+ continue
154
+
155
+ # Filter out duplicates and non-medical content
156
+ for result in platform_results:
157
+ url = result.get('url', '')
158
+ video_id = self._extract_video_id(url)
159
+
160
+ # Skip if URL or video ID already seen
161
+ if url in seen_urls or (video_id and video_id in seen_video_ids):
162
+ continue
163
+
164
+ # Check if it's a valid medical video (less strict for more results)
165
+ if self._is_valid_medical_video(result, query):
166
+ seen_urls.add(url)
167
+ if video_id:
168
+ seen_video_ids.add(video_id)
169
+
170
+ # Normalize YouTube URLs
171
+ if video_id and 'youtube.com' in url:
172
+ result['url'] = f"https://www.youtube.com/watch?v={video_id}"
173
+ result['video_id'] = video_id
174
+
175
+ results.append(result)
176
+ if len(results) >= num_results:
177
+ break
178
+
179
+ time.sleep(0.5) # Rate limiting
180
+ except Exception as e:
181
+ logger.warning(f"Video search failed for {platform['name']}: {e}")
182
+ continue
183
+
184
+ # Add fallback video sources if needed
185
+ if len(results) < num_results:
186
+ # Try resilient YouTube via Invidious API
187
+ try:
188
+ resilient = self._search_youtube_invidious(query, language, num_results - len(results))
189
+ for result in resilient:
190
+ url = result.get('url', '')
191
+ video_id = result.get('video_id', '')
192
+
193
+ if (url not in seen_urls and
194
+ video_id not in seen_video_ids and
195
+ self._is_valid_medical_video(result, query)):
196
+ seen_urls.add(url)
197
+ if video_id:
198
+ seen_video_ids.add(video_id)
199
+ results.append(result)
200
+ if len(results) >= num_results:
201
+ break
202
+ except Exception as e:
203
+ logger.warning(f"Invidious fallback failed: {e}")
204
+
205
+ # If still no results, try generic video search fallback
206
+ if len(results) < num_results:
207
+ try:
208
+ fallback_results = self._get_fallback_videos(query, language, num_results - len(results))
209
+ for result in fallback_results:
210
+ if result['url'] not in seen_urls:
211
+ seen_urls.add(result['url'])
212
+ results.append(result)
213
+ if len(results) >= num_results:
214
+ break
215
+ logger.info(f"Added {len(fallback_results)} fallback video results")
216
+ except Exception as e:
217
+ logger.warning(f"Fallback video search failed: {e}")
218
+
219
+ # Use reranker to improve quality and relevance
220
+ if results:
221
+ reranked_results = self.reranker.filter_youtube_results(results, query)
222
+ logger.info(f"Reranked {len(results)} video results to {len(reranked_results)} high-quality results")
223
+ return reranked_results[:num_results]
224
+
225
+ logger.info(f"Found {len(results)} medical video results")
226
+ return results[:num_results]
227
+
228
+ def _search_platform(self, query: str, platform: Dict, num_results: int) -> List[Dict]:
229
+ """Search a specific video platform with improved error handling"""
230
+ try:
231
+ search_url = platform['search_url']
232
+ params = platform['params'].copy()
233
+
234
+ # Set search query parameter
235
+ for param_name in params.keys():
236
+ params[param_name] = query
237
+
238
+ # Add headers to avoid blocking
239
+ headers = self.session.headers.copy()
240
+ headers.update({
241
+ 'Referer': 'https://www.google.com/',
242
+ 'Cache-Control': 'no-cache',
243
+ })
244
+
245
+ # Try with shorter timeout first
246
+ response = self.session.get(search_url, params=params, headers=headers, timeout=10)
247
+
248
+ # Check for common error responses
249
+ if response.status_code == 404:
250
+ logger.warning(f"Platform {platform['name']} returned 404 - endpoint may have changed")
251
+ return []
252
+ elif response.status_code == 403:
253
+ logger.warning(f"Platform {platform['name']} returned 403 - may be blocking requests")
254
+ return []
255
+ elif response.status_code >= 400:
256
+ logger.warning(f"Platform {platform['name']} returned {response.status_code}")
257
+ return []
258
+
259
+ response.raise_for_status()
260
+
261
+ soup = BeautifulSoup(response.content, 'html.parser')
262
+ results = []
263
+
264
+ # Try platform-specific selectors
265
+ selectors = platform.get('selectors', ['a[href*="video"]', 'a[href*="watch"]'])
266
+
267
+ links = []
268
+ for selector in selectors:
269
+ links = soup.select(selector)
270
+ if links:
271
+ logger.info(f"{platform['name']} found {len(links)} video links with selector: {selector}")
272
+ break
273
+
274
+ # If no links found, try generic selectors
275
+ if not links:
276
+ generic_selectors = ['a[href*="http"]', 'a[href*="www"]']
277
+ for selector in generic_selectors:
278
+ links = soup.select(selector)
279
+ if links:
280
+ logger.info(f"{platform['name']} found {len(links)} generic links with selector: {selector}")
281
+ break
282
+
283
+ for link in links[:num_results]:
284
+ try:
285
+ href = link.get('href')
286
+ if not href:
287
+ continue
288
+
289
+ # Make absolute URL
290
+ if href.startswith('/'):
291
+ href = platform['base_url'] + href
292
+
293
+ # Skip if not a valid URL
294
+ if not href.startswith('http'):
295
+ continue
296
+
297
+ title = link.get_text(strip=True) or platform['name']
298
+ if title and href:
299
+ results.append({
300
+ 'url': href,
301
+ 'title': title,
302
+ 'platform': platform['name'],
303
+ 'type': 'video',
304
+ 'source': platform['name']
305
+ })
306
+ except Exception as e:
307
+ logger.debug(f"Error parsing {platform['name']} link: {e}")
308
+ continue
309
+
310
+ return results
311
+
312
+ except requests.exceptions.Timeout:
313
+ logger.warning(f"Platform {platform['name']} search timed out")
314
+ return []
315
+ except requests.exceptions.ConnectionError:
316
+ logger.warning(f"Platform {platform['name']} connection failed - network issue")
317
+ return []
318
+ except Exception as e:
319
+ logger.warning(f"Platform {platform['name']} search failed: {e}")
320
+ return []
321
+
322
+ def _search_youtube_invidious(self, query: str, language: str, needed: int) -> List[Dict]:
323
+ """Search YouTube via public Invidious instances (no API key)."""
324
+ if needed <= 0:
325
+ return []
326
+ instances = [
327
+ "https://yewtu.be",
328
+ "https://invidious.flokinet.to",
329
+ "https://vid.puffyan.us",
330
+ "https://iv.ggtyler.dev"
331
+ ]
332
+ out: List[Dict] = []
333
+ q = quote(query)
334
+ for base in instances:
335
+ if len(out) >= needed:
336
+ break
337
+ try:
338
+ url = f"{base}/api/v1/search?q={q}&region={'VN' if language=='vi' else 'US'}&fields=title,videoId,author&type=video"
339
+ r = self.session.get(url, timeout=6)
340
+ r.raise_for_status()
341
+ data = r.json()
342
+ for item in data:
343
+ if len(out) >= needed:
344
+ break
345
+ vid = item.get("videoId")
346
+ title = (item.get("title") or "").strip()
347
+ if not vid or not title:
348
+ continue
349
+ out.append({
350
+ 'url': f"https://www.youtube.com/watch?v={vid}",
351
+ 'title': title,
352
+ 'thumbnail': f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg",
353
+ 'platform': 'youtube',
354
+ 'source': 'youtube',
355
+ 'type': 'video',
356
+ 'language': language
357
+ })
358
+ except Exception as e:
359
+ logger.debug(f"Invidious {base} failed: {e}")
360
+ continue
361
+ return out
362
+
363
+ def _get_fallback_videos(self, query: str, language: str, num_results: int) -> List[Dict]:
364
+ """Get fallback video sources when direct search fails"""
365
+ fallback_videos = {
366
+ 'en': [
367
+ {
368
+ 'url': 'https://www.youtube.com/results?search_query=medical+' + quote(query),
369
+ 'title': f'Medical Videos: {query}',
370
+ 'platform': 'youtube_fallback',
371
+ 'type': 'video',
372
+ 'source': 'youtube'
373
+ },
374
+ {
375
+ 'url': 'https://www.medscape.com/search?q=' + quote(query),
376
+ 'title': f'Medscape Videos: {query}',
377
+ 'platform': 'medscape_fallback',
378
+ 'type': 'video',
379
+ 'source': 'medscape'
380
+ }
381
+ ],
382
+ 'vi': [
383
+ {
384
+ 'url': 'https://www.youtube.com/results?search_query=y+tế+' + quote(query),
385
+ 'title': f'Video Y Tế: {query}',
386
+ 'platform': 'youtube_vi_fallback',
387
+ 'type': 'video',
388
+ 'source': 'youtube'
389
+ },
390
+ {
391
+ 'url': 'https://www.vinmec.com/vi/suc-khoe',
392
+ 'title': f'Vinmec Videos: {query}',
393
+ 'platform': 'vinmec_fallback',
394
+ 'type': 'video',
395
+ 'source': 'vinmec'
396
+ }
397
+ ],
398
+ 'zh': [
399
+ {
400
+ 'url': 'https://www.youtube.com/results?search_query=医疗+' + quote(query),
401
+ 'title': f'医疗视频: {query}',
402
+ 'platform': 'youtube_zh_fallback',
403
+ 'type': 'video',
404
+ 'source': 'youtube'
405
+ },
406
+ {
407
+ 'url': 'https://www.haodf.com/jibing',
408
+ 'title': f'好大夫视频: {query}',
409
+ 'platform': 'haodf_fallback',
410
+ 'type': 'video',
411
+ 'source': 'haodf'
412
+ }
413
+ ]
414
+ }
415
+
416
+ return fallback_videos.get(language, fallback_videos['en'])[:num_results]
417
+
418
+ def _extract_video_id(self, url: str) -> Optional[str]:
419
+ """Extract YouTube video ID from URL"""
420
+ patterns = [
421
+ r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
422
+ r'(?:embed\/)([0-9A-Za-z_-]{11})',
423
+ r'(?:watch\?v=)([0-9A-Za-z_-]{11})'
424
+ ]
425
+
426
+ for pattern in patterns:
427
+ match = re.search(pattern, url)
428
+ if match:
429
+ return match.group(1)
430
+
431
+ return None
432
+
search/extractors/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .content import ContentExtractor
2
+
3
+ __all__ = ['ContentExtractor']
search/extractors/content.py ADDED
@@ -0,0 +1,211 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import Dict, Optional
5
+ import re
6
+ from urllib.parse import urlparse
7
+ import time
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class ContentExtractor:
12
+ """Extract and clean content from web pages"""
13
+
14
+ def __init__(self, timeout: int = 15):
15
+ self.session = requests.Session()
16
+ self.session.headers.update({
17
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
18
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19
+ 'Accept-Language': 'en-US,en;q=0.5',
20
+ 'Accept-Encoding': 'gzip, deflate',
21
+ 'Connection': 'keep-alive',
22
+ })
23
+ self.timeout = timeout
24
+
25
+ # Medical content indicators
26
+ self.medical_indicators = [
27
+ 'symptom', 'treatment', 'diagnosis', 'medicine', 'medication',
28
+ 'therapy', 'condition', 'disease', 'health', 'medical',
29
+ 'doctor', 'physician', 'patient', 'clinical', 'study'
30
+ ]
31
+
32
+ def extract(self, url: str, max_length: int = 2000) -> Optional[str]:
33
+ """Extract content from a URL with medical focus"""
34
+ try:
35
+ response = self.session.get(url, timeout=self.timeout)
36
+ response.raise_for_status()
37
+
38
+ soup = BeautifulSoup(response.content, 'html.parser')
39
+
40
+ # Remove unwanted elements
41
+ self._remove_unwanted_elements(soup)
42
+
43
+ # Extract main content
44
+ content = self._extract_main_content(soup)
45
+
46
+ if not content:
47
+ return None
48
+
49
+ # Clean and process content
50
+ cleaned_content = self._clean_content(content)
51
+
52
+ # Focus on medical content if possible
53
+ medical_content = self._extract_medical_content(cleaned_content)
54
+
55
+ # Truncate to max length
56
+ final_content = self._truncate_content(medical_content or cleaned_content, max_length)
57
+
58
+ return final_content if final_content else None
59
+
60
+ except Exception as e:
61
+ logger.warning(f"Content extraction failed for {url}: {e}")
62
+ return None
63
+
64
+ def _remove_unwanted_elements(self, soup: BeautifulSoup):
65
+ """Remove unwanted HTML elements"""
66
+ unwanted_tags = [
67
+ 'script', 'style', 'nav', 'header', 'footer', 'aside',
68
+ 'advertisement', 'ads', 'sidebar', 'menu', 'navigation',
69
+ 'social', 'share', 'comment', 'comments', 'related',
70
+ 'cookie', 'privacy', 'terms', 'disclaimer'
71
+ ]
72
+
73
+ for tag in unwanted_tags:
74
+ for element in soup.find_all(tag):
75
+ element.decompose()
76
+
77
+ # Remove elements with unwanted classes/ids
78
+ unwanted_selectors = [
79
+ '[class*="ad"]', '[class*="advertisement"]', '[class*="sidebar"]',
80
+ '[class*="menu"]', '[class*="nav"]', '[class*="social"]',
81
+ '[class*="share"]', '[class*="comment"]', '[class*="related"]',
82
+ '[id*="ad"]', '[id*="sidebar"]', '[id*="menu"]', '[id*="nav"]'
83
+ ]
84
+
85
+ for selector in unwanted_selectors:
86
+ for element in soup.select(selector):
87
+ element.decompose()
88
+
89
+ def _extract_main_content(self, soup: BeautifulSoup) -> str:
90
+ """Extract main content from the page"""
91
+ # Priority order for content extraction
92
+ content_selectors = [
93
+ 'article',
94
+ 'main',
95
+ '[role="main"]',
96
+ '.content',
97
+ '.main-content',
98
+ '.article-content',
99
+ '.post-content',
100
+ '.entry-content',
101
+ '.page-content',
102
+ 'body'
103
+ ]
104
+
105
+ for selector in content_selectors:
106
+ elements = soup.select(selector)
107
+ if elements:
108
+ # Get the largest content element
109
+ largest_element = max(elements, key=lambda x: len(x.get_text()))
110
+ content = largest_element.get_text(separator=' ', strip=True)
111
+ if len(content) > 100: # Minimum content length
112
+ return content
113
+
114
+ # Fallback: get all text
115
+ return soup.get_text(separator=' ', strip=True)
116
+
117
+ def _clean_content(self, content: str) -> str:
118
+ """Clean and normalize content"""
119
+ if not content:
120
+ return ""
121
+
122
+ # Remove excessive whitespace
123
+ content = re.sub(r'\s+', ' ', content)
124
+
125
+ # Remove common web artifacts
126
+ artifacts = [
127
+ r'Cookie\s+Policy',
128
+ r'Privacy\s+Policy',
129
+ r'Terms\s+of\s+Service',
130
+ r'Subscribe\s+to\s+our\s+newsletter',
131
+ r'Follow\s+us\s+on',
132
+ r'Share\s+this\s+article',
133
+ r'Related\s+articles',
134
+ r'Advertisement',
135
+ r'Ad\s+content'
136
+ ]
137
+
138
+ for artifact in artifacts:
139
+ content = re.sub(artifact, '', content, flags=re.IGNORECASE)
140
+
141
+ # Remove excessive punctuation
142
+ content = re.sub(r'[.]{3,}', '...', content)
143
+ content = re.sub(r'[!]{2,}', '!', content)
144
+ content = re.sub(r'[?]{2,}', '?', content)
145
+
146
+ return content.strip()
147
+
148
+ def _extract_medical_content(self, content: str) -> Optional[str]:
149
+ """Extract medical-focused content from the text"""
150
+ if not content:
151
+ return None
152
+
153
+ # Split content into sentences
154
+ sentences = re.split(r'[.!?]+', content)
155
+ medical_sentences = []
156
+
157
+ for sentence in sentences:
158
+ sentence = sentence.strip()
159
+ if len(sentence) < 20: # Skip very short sentences
160
+ continue
161
+
162
+ # Check if sentence contains medical indicators
163
+ sentence_lower = sentence.lower()
164
+ if any(indicator in sentence_lower for indicator in self.medical_indicators):
165
+ medical_sentences.append(sentence)
166
+
167
+ if medical_sentences:
168
+ # Return medical sentences, prioritizing longer ones
169
+ medical_sentences.sort(key=len, reverse=True)
170
+ return '. '.join(medical_sentences[:10]) + '.'
171
+
172
+ return None
173
+
174
+ def _truncate_content(self, content: str, max_length: int) -> str:
175
+ """Truncate content to max length while preserving sentences"""
176
+ if len(content) <= max_length:
177
+ return content
178
+
179
+ # Try to truncate at sentence boundary
180
+ truncated = content[:max_length]
181
+ last_period = truncated.rfind('.')
182
+ last_exclamation = truncated.rfind('!')
183
+ last_question = truncated.rfind('?')
184
+
185
+ last_sentence_end = max(last_period, last_exclamation, last_question)
186
+
187
+ if last_sentence_end > max_length * 0.7: # If we can find a good break point
188
+ return content[:last_sentence_end + 1]
189
+
190
+ # Fallback: truncate at word boundary
191
+ words = truncated.split()
192
+ if len(words) > 1:
193
+ return ' '.join(words[:-1]) + '...'
194
+
195
+ return truncated + '...'
196
+
197
+ def extract_multiple(self, urls: list, max_length: int = 2000) -> Dict[str, str]:
198
+ """Extract content from multiple URLs"""
199
+ results = {}
200
+
201
+ for url in urls:
202
+ try:
203
+ content = self.extract(url, max_length)
204
+ if content:
205
+ results[url] = content
206
+ time.sleep(0.5) # Be respectful to servers
207
+ except Exception as e:
208
+ logger.warning(f"Failed to extract content from {url}: {e}")
209
+ continue
210
+
211
+ return results
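A brief usage sketch for the extractor above. The URLs are placeholders, and what comes back depends on each page's markup; failed extractions simply return None or are skipped:

```python
from search.extractors.content import ContentExtractor

extractor = ContentExtractor(timeout=15)

# Single page: cleaned, medically focused text, or None if extraction fails.
text = extractor.extract("https://example.org/health-article", max_length=1500)
if text:
    print(text[:200], "...")

# Several pages: a {url: content} dict containing only pages that extracted cleanly.
pages = extractor.extract_multiple(
    ["https://example.org/a", "https://example.org/b"],
    max_length=1000,
)
print(f"extracted {len(pages)} of 2 pages")
```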
search/processors/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .medical import MedicalSearchProcessor
2
+ from .language import LanguageProcessor
3
+ from .sources import SourceAggregator
4
+ from .enhanced import EnhancedContentProcessor
5
+
6
+ __all__ = ['MedicalSearchProcessor', 'LanguageProcessor', 'SourceAggregator', 'EnhancedContentProcessor']
search/processors/cooking.py ADDED
@@ -0,0 +1,258 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ from models.summarizer import summarizer
4
+ import re
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class CookingSearchProcessor:
9
+ """Process and enhance cooking search results"""
10
+
11
+ def __init__(self):
12
+ self.cooking_keywords = [
13
+ 'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
14
+ 'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
15
+ 'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
16
+ 'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
17
+ 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
18
+ 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
19
+ 'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai',
20
+ 'substitution', 'alternative', 'variation', 'modification', 'adaptation',
21
+ 'troubleshooting', 'tips', 'tricks', 'hacks', 'mistakes', 'common errors'
22
+ ]
23
+
24
+ def process_results(self, results: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
25
+ """Process search results and create comprehensive cooking summary"""
26
+ if not results:
27
+ return "", {}
28
+
29
+ # Filter and rank results by cooking relevance
30
+ relevant_results = self._filter_cooking_results(results, user_query)
31
+
32
+ if not relevant_results:
33
+ logger.warning("No cooking-relevant results found")
34
+ return "", {}
35
+
36
+ # Extract and summarize content
37
+ summarized_results = self._summarize_results(relevant_results, user_query)
38
+
39
+ # Create comprehensive summary
40
+ combined_summary = self._create_combined_summary(summarized_results, user_query)
41
+
42
+ # Create URL mapping for citations
43
+ url_mapping = self._create_url_mapping(relevant_results)
44
+
45
+ return combined_summary, url_mapping
46
+
47
+ def _filter_cooking_results(self, results: List[Dict], user_query: str) -> List[Dict]:
48
+ """Filter results by cooking relevance"""
49
+ relevant_results = []
50
+
51
+ for result in results:
52
+ relevance_score = self._calculate_relevance_score(result, user_query)
53
+
54
+ if relevance_score > 0.3: # Threshold for cooking relevance
55
+ result['relevance_score'] = relevance_score
56
+ relevant_results.append(result)
57
+
58
+ # Sort by relevance score
59
+ relevant_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
60
+
61
+ # Limit to top results
62
+ return relevant_results[:10]
63
+
64
+ def _calculate_relevance_score(self, result: Dict, user_query: str) -> float:
65
+ """Calculate cooking relevance score for a result"""
66
+ score = 0.0
67
+
68
+ # Check title relevance
69
+ title = result.get('title', '').lower()
70
+ query_lower = user_query.lower()
71
+
72
+ # Direct query match in title
73
+ if any(word in title for word in query_lower.split()):
74
+ score += 0.4
75
+
76
+ # Cooking keyword match in title
77
+ cooking_matches = sum(1 for keyword in self.cooking_keywords if keyword in title)
78
+ score += min(cooking_matches * 0.1, 0.3)
79
+
80
+ # Domain credibility for cooking sources
81
+ url = result.get('url', '').lower()
82
+ credible_domains = [
83
+ 'allrecipes.com', 'foodnetwork.com', 'epicurious.com', 'seriouseats.com',
84
+ 'bonappetit.com', 'cooking.nytimes.com', 'tasteofhome.com', 'food.com',
85
+ 'bbcgoodfood.com', 'jamieoliver.com', 'gordonramsay.com', 'marthastewart.com',
86
+ 'kingarthurbaking.com', 'sallysbakingaddiction.com', 'smittenkitchen.com'
87
+ ]
88
+
89
+ if any(domain in url for domain in credible_domains):
90
+ score += 0.3
91
+
92
+ # Source type bonus for cooking
93
+ source = result.get('source', '')
94
+ if 'cooking' in source or 'recipe' in source or any(domain in source for domain in credible_domains):
95
+ score += 0.2
96
+
97
+ return min(score, 1.0)
98
+
99
+ def _summarize_results(self, results: List[Dict], user_query: str) -> List[Dict]:
100
+ """Summarize content from search results"""
101
+ summarized_results = []
102
+
103
+ for i, result in enumerate(results):
104
+ try:
105
+ content = result.get('content', '')
106
+ if not content:
107
+ continue
108
+
109
+ # Create focused summary
110
+ summary = summarizer.summarize_for_query(content, user_query, max_length=300)
111
+
112
+ if summary:
113
+ summarized_results.append({
114
+ 'id': i + 1,
115
+ 'url': result['url'],
116
+ 'title': result['title'],
117
+ 'summary': summary,
118
+ 'relevance_score': result.get('relevance_score', 0)
119
+ })
120
+
121
+ except Exception as e:
122
+ logger.warning(f"Failed to summarize result {i}: {e}")
123
+ continue
124
+
125
+ return summarized_results
126
+
127
+ def _create_combined_summary(self, summarized_results: List[Dict], user_query: str) -> str:
128
+ """Create a comprehensive summary from all results with proper source attribution"""
129
+ if not summarized_results:
130
+ return ""
131
+
132
+ logger.info(f"Creating combined summary from {len(summarized_results)} results")
133
+
134
+ # Group by topic/similarity
135
+ topic_groups = self._group_by_topic(summarized_results)
136
+
137
+ summary_parts = []
138
+ citation_counter = 1
139
+
140
+ for topic, results in topic_groups.items():
141
+ if not results:
142
+ continue
143
+
144
+ logger.info(f"Processing {topic} topic with {len(results)} results")
145
+
146
+ # Create topic summary with source attribution
147
+ topic_summary = self._create_topic_summary(topic, results, user_query, citation_counter)
148
+ if topic_summary:
149
+ summary_parts.append(topic_summary)
150
+ # Update citation counter for next topic
151
+ citation_counter += len([r for r in results if r.get('summary')])
152
+
153
+ # Combine all parts
154
+ combined_summary = "\n\n".join(summary_parts)
155
+
156
+ # Don't over-summarize - keep source attribution intact
157
+ if len(combined_summary) > 2000:
158
+ # Only truncate if absolutely necessary, but preserve structure
159
+ lines = combined_summary.split('\n')
160
+ truncated_lines = []
161
+ current_length = 0
162
+
163
+ for line in lines:
164
+ if current_length + len(line) > 2000:
165
+ break
166
+ truncated_lines.append(line)
167
+ current_length += len(line)
168
+
169
+ combined_summary = '\n'.join(truncated_lines)
170
+ if len(truncated_lines) < len(lines):
171
+ combined_summary += "\n\n*[Additional information available from multiple sources]*"
172
+
173
+ logger.info(f"Final combined summary length: {len(combined_summary)} characters")
174
+ return combined_summary
175
+
176
+ def _group_by_topic(self, results: List[Dict]) -> Dict[str, List[Dict]]:
177
+ """Group results by cooking topic"""
178
+ topics = {
179
+ 'recipes': [],
180
+ 'techniques': [],
181
+ 'ingredients': [],
182
+ 'general': []
183
+ }
184
+
185
+ for result in results:
186
+ title_lower = result['title'].lower()
187
+ summary_lower = result.get('summary', '').lower()
188
+ content_lower = f"{title_lower} {summary_lower}"
189
+
190
+ # Categorize by content
191
+ if any(word in content_lower for word in ['recipe', 'ingredients', 'instructions', 'steps']):
192
+ topics['recipes'].append(result)
193
+ elif any(word in content_lower for word in ['technique', 'method', 'how to', 'cooking']):
194
+ topics['techniques'].append(result)
195
+ elif any(word in content_lower for word in ['ingredients', 'substitution', 'alternative', 'variation']):
196
+ topics['ingredients'].append(result)
197
+ else:
198
+ topics['general'].append(result)
199
+
200
+ return topics
201
+
202
+ def _create_topic_summary(self, topic: str, results: List[Dict], user_query: str, citation_start: int = 1) -> str:
203
+ """Create summary for a specific topic with source attribution"""
204
+ if not results:
205
+ return ""
206
+
207
+ # Add topic header
208
+ topic_headers = {
209
+ 'recipes': "**Recipes and Instructions:**",
210
+ 'techniques': "**Cooking Techniques:**",
211
+ 'ingredients': "**Ingredients and Substitutions:**",
212
+ 'general': "**General Information:**"
213
+ }
214
+
215
+ header = topic_headers.get(topic, "**Information:**")
216
+ summary_parts = [header]
217
+
218
+ # Process each result individually to maintain source attribution
219
+ for i, result in enumerate(results[:3]): # Limit to top 3 per topic
220
+ summary = result.get('summary', '')
221
+ if not summary:
222
+ continue
223
+
224
+ # Extract domain from URL for source attribution
225
+ url = result.get('url', '')
226
+ domain = self._extract_domain(url)
227
+
228
+ # Use proper citation number
229
+ citation_num = citation_start + i
230
+
231
+ # Add source attribution (domain plus citation number)
232
+ summary_with_source = f"* {summary} ({domain}) <#{citation_num}>"
233
+ summary_parts.append(summary_with_source)
234
+
235
+ return "\n".join(summary_parts)
236
+
237
+ def _extract_domain(self, url: str) -> str:
238
+ """Extract domain name from URL"""
239
+ try:
240
+ from urllib.parse import urlparse
241
+ parsed = urlparse(url)
242
+ domain = parsed.netloc.lower()
243
+ # Remove www. prefix
244
+ if domain.startswith('www.'):
245
+ domain = domain[4:]
246
+ return domain
247
+ except:
248
+ return ""
249
+
250
+ def _create_url_mapping(self, results: List[Dict]) -> Dict[int, str]:
251
+ """Create URL mapping for citations"""
252
+ url_mapping = {}
253
+
254
+ for i, result in enumerate(results):
255
+ url_mapping[i + 1] = result['url']
256
+
257
+ logger.info(f"Created URL mapping for {len(url_mapping)} sources")
258
+ return url_mapping
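A small sketch of how the processor above is driven. The input result is fabricated for illustration; `process_results` returns the combined topic summary plus a `{citation_number: url}` map matching the `<#N>` markers it emits (note it also pulls in `models.summarizer`, so the summarizer model must be available):

```python
from search.processors.cooking import CookingSearchProcessor

processor = CookingSearchProcessor()

# Fabricated search result with pre-extracted page content.
raw_results = [
    {
        "url": "https://www.seriouseats.com/example-carbonara",
        "title": "Classic carbonara recipe and technique",
        "content": "Ingredients include guanciale, eggs, and pecorino. "
                   "Cooking time is about 20 minutes on the stovetop.",
        "source": "seriouseats.com",
    },
]

summary, url_mapping = processor.process_results(raw_results, "how to make carbonara")
print(summary)        # topic sections with <#N> citation markers
print(url_mapping)    # e.g. {1: "https://www.seriouseats.com/example-carbonara"}
```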
search/processors/enhanced.py ADDED
@@ -0,0 +1,331 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple, Set
3
+ import re
4
+ from collections import defaultdict
5
+ from models.summarizer import summarizer
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class EnhancedContentProcessor:
10
+ """Enhanced content processing for maximum information extraction"""
11
+
12
+ def __init__(self):
13
+ # Cooking content patterns for extraction
14
+ self.cooking_patterns = {
15
+ 'ingredients': [
16
+ r'ingredients?\s+(?:include|are|may include|can include)',
17
+ r'you\s+need',
18
+ r'required\s+ingredients?',
19
+ r'main\s+ingredients?',
20
+ r'key\s+ingredients?'
21
+ ],
22
+ 'techniques': [
23
+ r'techniques?\s+(?:include|are|may include|can include)',
24
+ r'cooking\s+methods?',
25
+ r'preparation\s+methods?',
26
+ r'how\s+to\s+cook',
27
+ r'cooking\s+process'
28
+ ],
29
+ 'instructions': [
30
+ r'instructions?\s+(?:include|are|may include)',
31
+ r'steps?\s+(?:include|are|may include)',
32
+ r'how\s+to\s+make',
33
+ r'preparation\s+steps?',
34
+ r'cooking\s+steps?'
35
+ ],
36
+ 'timing': [
37
+ r'timing\s+(?:include|are|may include)',
38
+ r'cooking\s+time',
39
+ r'preparation\s+time',
40
+ r'total\s+time',
41
+ r'duration'
42
+ ],
43
+ 'tips': [
44
+ r'tips?\s+(?:include|are|may include)',
45
+ r'advice\s+(?:include|are|may include)',
46
+ r'recommendations?',
47
+ r'helpful\s+hints?',
48
+ r'secrets?'
49
+ ],
50
+ 'variations': [
51
+ r'variations?\s+(?:include|are|may include)',
52
+ r'substitutions?\s+(?:include|are|may include)',
53
+ r'alternatives?\s+(?:include|are|may include)',
54
+ r'modifications?\s+(?:include|are|may include)',
55
+ r'complications?'
56
+ ]
57
+ }
58
+
59
+ # Content quality indicators
60
+ self.quality_indicators = {
61
+ 'high': [
62
+ 'professional chef', 'culinary institute', 'food science', 'nutrition research',
63
+ 'evidence-based', 'peer-reviewed', 'published study', 'research shows',
64
+ 'culinary guidelines', 'chef consensus', 'expert opinion'
65
+ ],
66
+ 'medium': [
67
+ 'studies show', 'research indicates', 'culinary literature',
68
+ 'professional experience', 'case studies', 'observational studies'
69
+ ],
70
+ 'low': [
71
+ 'some people', 'may help', 'could be', 'might work',
72
+ 'anecdotal', 'personal experience', 'unverified'
73
+ ]
74
+ }
75
+
76
+ def process_comprehensive_content(self, sources: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
77
+ """Process all sources to extract maximum relevant information"""
78
+ if not sources:
79
+ return "", {}
80
+
81
+ logger.info(f"Processing {len(sources)} sources for comprehensive information extraction")
82
+
83
+ # Extract structured information from each source
84
+ structured_info = self._extract_structured_information(sources, user_query)
85
+
86
+ # Create comprehensive summary
87
+ comprehensive_summary = self._create_comprehensive_summary(structured_info, user_query)
88
+
89
+ # Create detailed reference mapping
90
+ reference_mapping = self._create_detailed_reference_mapping(sources)
91
+
92
+ return comprehensive_summary, reference_mapping
93
+
94
+ def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
95
+ """Extract structured information by cooking categories"""
96
+ structured_info = defaultdict(list)
97
+
98
+ for source in sources:
99
+ content = source.get('content', '')
100
+ if not content:
101
+ continue
102
+
103
+ # Extract information by cooking category
104
+ for category, patterns in self.cooking_patterns.items():
105
+ extracted_info = self._extract_category_info(content, patterns, category, user_query)
106
+ if extracted_info:
107
+ structured_info[category].append({
108
+ 'content': extracted_info,
109
+ 'source': source,
110
+ 'relevance_score': self._calculate_relevance_score(extracted_info, user_query)
111
+ })
112
+
113
+ # Sort by relevance within each category
114
+ for category in structured_info:
115
+ structured_info[category].sort(key=lambda x: x['relevance_score'], reverse=True)
116
+
117
+ return dict(structured_info)
118
+
119
+ def _extract_category_info(self, content: str, patterns: List[str], category: str, user_query: str) -> str:
120
+ """Extract information for a specific cooking category"""
121
+ extracted_sentences = []
122
+
123
+ # Split content into sentences
124
+ sentences = re.split(r'[.!?]+', content)
125
+
126
+ for sentence in sentences:
127
+ sentence = sentence.strip()
128
+ if len(sentence) < 20: # Skip very short sentences
129
+ continue
130
+
131
+ # Check if sentence matches any pattern for this category
132
+ for pattern in patterns:
133
+ if re.search(pattern, sentence, re.IGNORECASE):
134
+ # Check relevance to user query
135
+ if self._is_relevant_to_query(sentence, user_query):
136
+ extracted_sentences.append(sentence)
137
+ break
138
+
139
+ # Combine and summarize extracted sentences
140
+ if extracted_sentences:
141
+ combined_text = '. '.join(extracted_sentences[:5]) # Limit to top 5 sentences
142
+ return summarizer.summarize_for_query(combined_text, user_query, max_length=300)
143
+
144
+ return ""
145
+
146
+ def _is_relevant_to_query(self, sentence: str, user_query: str) -> bool:
147
+ """Check if sentence is relevant to user query"""
148
+ query_words = set(user_query.lower().split())
149
+ sentence_words = set(sentence.lower().split())
150
+
151
+ # Calculate word overlap
152
+ overlap = len(query_words.intersection(sentence_words))
153
+ return overlap >= 2 # At least 2 words in common
154
+
155
+ def _calculate_relevance_score(self, content: str, user_query: str) -> float:
156
+ """Calculate relevance score for content"""
157
+ if not content or not user_query:
158
+ return 0.0
159
+
160
+ query_words = set(user_query.lower().split())
161
+ content_words = set(content.lower().split())
162
+
163
+ # Word overlap score
164
+ overlap = len(query_words.intersection(content_words))
165
+ overlap_score = overlap / len(query_words) if query_words else 0
166
+
167
+ # Content quality score
168
+ quality_score = self._assess_content_quality(content)
169
+
170
+ # Length score (prefer medium-length content)
171
+ length_score = min(len(content) / 500, 1.0) # Normalize to 0-1
172
+
173
+ # Composite score
174
+ composite_score = (
175
+ overlap_score * 0.5 + # 50% relevance to query
176
+ quality_score * 0.3 + # 30% content quality
177
+ length_score * 0.2 # 20% appropriate length
178
+ )
179
+
180
+ return min(composite_score, 1.0)
181
+
182
+ def _assess_content_quality(self, content: str) -> float:
183
+ """Assess content quality based on cooking indicators"""
184
+ content_lower = content.lower()
185
+
186
+ high_indicators = sum(1 for indicator in self.quality_indicators['high'] if indicator in content_lower)
187
+ medium_indicators = sum(1 for indicator in self.quality_indicators['medium'] if indicator in content_lower)
188
+ low_indicators = sum(1 for indicator in self.quality_indicators['low'] if indicator in content_lower)
189
+
190
+ # Calculate quality score
191
+ if high_indicators > 0:
192
+ return 0.9
193
+ elif medium_indicators > 0:
194
+ return 0.7
195
+ elif low_indicators > 0:
196
+ return 0.5
197
+ else:
198
+ return 0.6 # Default score for neutral content
199
+
200
+ def _create_comprehensive_summary(self, structured_info: Dict[str, List[Dict]], user_query: str) -> str:
201
+ """Create comprehensive summary from structured information"""
202
+ if not structured_info:
203
+ return ""
204
+
205
+ summary_parts = []
206
+
207
+ # Process each category
208
+ category_headers = {
209
+ 'ingredients': "**🥘 Ingredients & Shopping:**",
210
+ 'techniques': "**👨‍🍳 Cooking Techniques:**",
211
+ 'instructions': "**📋 Step-by-Step Instructions:**",
212
+ 'timing': "**⏰ Timing & Preparation:**",
213
+ 'tips': "**💡 Pro Tips & Tricks:**",
214
+ 'variations': "**🔄 Variations & Substitutions:**"
215
+ }
216
+
217
+ for category, info_list in structured_info.items():
218
+ if not info_list:
219
+ continue
220
+
221
+ # Take top 2 most relevant items for each category
222
+ top_items = info_list[:2]
223
+
224
+ category_content = []
225
+ for item in top_items:
226
+ content = item['content']
227
+ if content:
228
+ category_content.append(content)
229
+
230
+ if category_content:
231
+ # Combine and summarize category content
232
+ combined_content = ' '.join(category_content)
233
+ category_summary = summarizer.summarize_for_query(combined_content, user_query, max_length=400)
234
+
235
+ if category_summary:
236
+ header = category_headers.get(category, f"**{category.title()}:**")
237
+ summary_parts.append(f"{header}\n{category_summary}")
238
+
239
+ # Combine all parts
240
+ comprehensive_summary = "\n\n".join(summary_parts)
241
+
242
+ # Final summarization to ensure conciseness
243
+ if len(comprehensive_summary) > 2000:
244
+ comprehensive_summary = summarizer.summarize_text(comprehensive_summary, max_length=2000)
245
+
246
+ return comprehensive_summary
247
+
248
+ def _create_detailed_reference_mapping(self, sources: List[Dict]) -> Dict[int, Dict]:
249
+ """Create detailed reference mapping with source metadata"""
250
+ reference_mapping = {}
251
+
252
+ for i, source in enumerate(sources, 1):
253
+ # Be defensive: some upstream sources may miss optional fields
254
+ reference_mapping[i] = {
255
+ 'url': source.get('url', ''),
256
+ 'title': source.get('title', ''),
257
+ 'domain': source.get('domain', ''),
258
+ 'source_type': source.get('source_type', 'text'),
259
+ 'language': source.get('language', 'en'),
260
+ 'type': source.get('type', 'text'),
261
+ 'content_length': len(source.get('content', '')),
262
+ 'composite_score': source.get('composite_score', 0.7)
263
+ }
264
+
265
+ return reference_mapping
266
+
267
+ def create_inline_citations(self, text: str, reference_mapping: Dict[int, Dict]) -> str:
268
+ """Create inline citations within the text"""
269
+ if not reference_mapping:
270
+ return text
271
+
272
+ # Find places where citations should be added
273
+ # This is a simplified version - in practice, you'd use more sophisticated NLP
274
+
275
+ # Add citations after key cooking statements
276
+ citation_patterns = [
277
+ r'(ingredients?\s+(?:include|are)[^.]*\.)',
278
+ r'(techniques?\s+(?:include|are)[^.]*\.)',
279
+ r'(instructions?\s+(?:include|are)[^.]*\.)',
280
+ r'(timing\s+(?:include|are)[^.]*\.)',
281
+ r'(studies?\s+show[^.]*\.)',
282
+ r'(research\s+(?:indicates|shows)[^.]*\.)'
283
+ ]
284
+
285
+ cited_text = text
286
+ citation_count = 1
287
+
288
+ for pattern in citation_patterns:
289
+ matches = re.finditer(pattern, cited_text, re.IGNORECASE)
290
+ for match in matches:
291
+ if citation_count <= len(reference_mapping):
292
+ citation_tag = f" <#{citation_count}>"
293
+ cited_text = cited_text.replace(match.group(1), match.group(1) + citation_tag, 1)
294
+ citation_count += 1
295
+
296
+ return cited_text
297
+
298
+ def generate_source_statistics(self, sources: List[Dict]) -> str:
299
+ """Generate statistics about sources used"""
300
+ if not sources:
301
+ return ""
302
+
303
+ total_sources = len(sources)
304
+ # credibility removed
305
+
306
+ # Language distribution
307
+ languages = defaultdict(int)
308
+ for source in sources:
309
+ lang = source.get('language', 'en')
310
+ languages[lang] += 1
311
+
312
+ # Source type distribution
313
+ source_types = defaultdict(int)
314
+ for source in sources:
315
+ source_type = source.get('source_type', 'other')
316
+ source_types[source_type] += 1
317
+
318
+ # Content length statistics
319
+ content_lengths = [len(s.get('content', '')) for s in sources]
320
+ avg_content_length = sum(content_lengths) / len(content_lengths) if content_lengths else 0
321
+
322
+ stats_parts = []
323
+ stats_parts.append(f"**📊 Source Statistics:**")
324
+ stats_parts.append(f"• **Total Sources**: {total_sources}")
325
+ # removed credibility summary
326
+ stats_parts.append(f"• **Languages**: {', '.join([f'{count} {lang}' for lang, count in languages.items()])}")
327
+ stats_parts.append(f"• **Types**: {', '.join([f'{count} {type_name}' for type_name, count in source_types.items()])}")
328
+ stats_parts.append(f"• **Avg Content Length**: {avg_content_length:.0f} characters")
329
+
330
+ return "\n".join(stats_parts)
331
+
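For orientation, here is a minimal usage sketch of the processor defined in this file. The class name and the module-level `summarizer` dependency are declared earlier in the file and are not visible in this diff, so `ComprehensiveProcessor` below is an assumed name rather than a confirmed one:

```python
# Hypothetical usage sketch; the real class name is defined earlier in
# search/processors/comprehensive.py and may differ from the one assumed here.
from search.processors.comprehensive import ComprehensiveProcessor  # assumed name

sources = [
    {
        "url": "https://example.com/roast-chicken",
        "title": "How to roast a chicken",
        "content": (
            "Ingredients include a whole chicken, butter, and thyme for the roast. "
            "Instructions include preheating the oven and roasting the chicken for about an hour."
        ),
        "language": "en",
        "source_type": "commercial",
        "type": "text",
    }
]

processor = ComprehensiveProcessor()
summary, refs = processor.process_comprehensive_content(sources, "how to roast a chicken")
print(summary)   # category-grouped summary (ingredients, instructions, ...); may be empty for sparse input
print(refs[1])   # per-source metadata dict used for citations
```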
search/processors/language.py ADDED
@@ -0,0 +1,266 @@
1
+ import re
2
+ import logging
3
+ from typing import List, Dict, Tuple, Optional
4
+ from langdetect import detect, DetectorFactory
5
+ from langdetect.lang_detect_exception import LangDetectException
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # Set seed for consistent language detection
10
+ DetectorFactory.seed = 0
11
+
12
+ class LanguageProcessor:
13
+ """Process and enhance queries for multilingual medical search"""
14
+
15
+ def __init__(self):
16
+ # Medical keywords in different languages
17
+ self.medical_keywords = {
18
+ 'en': [
19
+ 'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
20
+ 'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
21
+ 'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
22
+ 'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
23
+ 'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
24
+ 'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
25
+ 'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
26
+ 'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
27
+ 'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
28
+ ],
29
+ 'vi': [
30
+ 'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
31
+ 'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
32
+ 'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác sĩ', 'y tế',
33
+ 'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
34
+ 'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
35
+ 'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
36
+ 'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
37
+ 'em bé', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
38
+ 'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
39
+ ],
40
+ 'zh': [
41
+ '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
42
+ '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
43
+ '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
44
+ '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
45
+ '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
46
+ '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
47
+ '手术', '医院', '诊所'
48
+ ]
49
+ }
50
+
51
+ # Language-specific search enhancements
52
+ self.language_enhancements = {
53
+ 'vi': {
54
+ 'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
55
+ 'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác sĩ']
56
+ },
57
+ 'zh': {
58
+ 'common_terms': ['是什么', '原因', '治疗方法', '症状'],
59
+ 'medical_context': ['医疗', '健康', '医院', '医生']
60
+ },
61
+ 'en': {
62
+ 'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
63
+ 'medical_context': ['medical', 'health', 'hospital', 'doctor']
64
+ }
65
+ }
66
+
67
+ def detect_language(self, text: str) -> str:
68
+ """Detect the language of the input text"""
69
+ if not text or not text.strip():
70
+ return 'en' # Default to English
71
+
72
+ try:
73
+ # Clean text for better detection
74
+ cleaned_text = re.sub(r'[^\w\s]', ' ', text)
75
+ cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
76
+
77
+ if len(cleaned_text) < 3:
78
+ return 'en'
79
+
80
+ detected = detect(cleaned_text)
81
+
82
+ # Map detected language to our supported languages
83
+ language_mapping = {
84
+ 'vi': 'vi', # Vietnamese
85
+ 'zh-cn': 'zh', # Chinese Simplified
86
+ 'zh-tw': 'zh', # Chinese Traditional
87
+ 'zh': 'zh', # Chinese
88
+ 'en': 'en' # English
89
+ }
90
+
91
+ return language_mapping.get(detected, 'en')
92
+
93
+ except LangDetectException as e:
94
+ logger.warning(f"Language detection failed: {e}")
95
+ return 'en'
96
+
97
+ def enhance_query(self, query: str, target_language: str = None) -> Dict[str, str]:
98
+ """Enhance query for better search results in multiple languages"""
99
+ if not query or not query.strip():
100
+ return {}
101
+
102
+ # Detect source language
103
+ source_language = self.detect_language(query)
104
+
105
+ # If target language not specified, use source language
106
+ if target_language is None:
107
+ target_language = source_language
108
+
109
+ enhanced_queries = {}
110
+
111
+ # Original query
112
+ enhanced_queries[source_language] = query
113
+
114
+ # Enhance for source language
115
+ if source_language in self.language_enhancements:
116
+ enhanced_queries[source_language] = self._enhance_for_language(
117
+ query, source_language
118
+ )
119
+
120
+ # Create translations for other languages if needed
121
+ if target_language != source_language:
122
+ enhanced_queries[target_language] = self._translate_query(
123
+ query, source_language, target_language
124
+ )
125
+
126
+ # Add English version for comprehensive search
127
+ if 'en' not in enhanced_queries:
128
+ if source_language != 'en':
129
+ enhanced_queries['en'] = self._translate_query(query, source_language, 'en')
130
+ else:
131
+ enhanced_queries['en'] = query
132
+
133
+ return enhanced_queries
134
+
135
+ def _enhance_for_language(self, query: str, language: str) -> str:
136
+ """Enhance query for a specific language"""
137
+ enhancements = self.language_enhancements.get(language, {})
138
+ common_terms = enhancements.get('common_terms', [])
139
+ medical_context = enhancements.get('medical_context', [])
140
+
141
+ # Check if query already contains medical context
142
+ query_lower = query.lower()
143
+ has_medical_context = any(term in query_lower for term in medical_context)
144
+
145
+ # If no medical context, add it
146
+ if not has_medical_context and medical_context:
147
+ # Add the most relevant medical context term
148
+ query += f" {medical_context[0]}"
149
+
150
+ # Check if query is a question and add relevant terms
151
+ if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
152
+ if common_terms:
153
+ query += f" {common_terms[0]}" # Add "causes" or equivalent
154
+
155
+ return query.strip()
156
+
157
+ def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
158
+ """Simple keyword-based translation for medical terms"""
159
+ # This is a basic implementation - in production, you'd use a proper translation service
160
+
161
+ # Medical term translations
162
+ translations = {
163
+ ('vi', 'en'): {
164
+ 'triệu chứng': 'symptoms',
165
+ 'đau': 'pain',
166
+ 'đau đầu': 'headache',
167
+ 'sốt': 'fever',
168
+ 'ho': 'cough',
169
+ 'điều trị': 'treatment',
170
+ 'thuốc': 'medicine',
171
+ 'bệnh': 'disease',
172
+ 'bác sĩ': 'doctor',
173
+ 'sức khỏe': 'health',
174
+ 'bệnh viện': 'hospital'
175
+ },
176
+ ('zh', 'en'): {
177
+ '症状': 'symptoms',
178
+ '疼痛': 'pain',
179
+ '头痛': 'headache',
180
+ '发烧': 'fever',
181
+ '咳嗽': 'cough',
182
+ '治疗': 'treatment',
183
+ '药物': 'medicine',
184
+ '疾病': 'disease',
185
+ '医生': 'doctor',
186
+ '健康': 'health',
187
+ '医院': 'hospital'
188
+ },
189
+ ('en', 'vi'): {
190
+ 'symptoms': 'triệu chứng',
191
+ 'pain': 'đau',
192
+ 'headache': 'đau đầu',
193
+ 'fever': 'sốt',
194
+ 'cough': 'ho',
195
+ 'treatment': 'điều trị',
196
+ 'medicine': 'thuốc',
197
+ 'disease': 'bệnh',
198
+ 'doctor': 'bác sĩ',
199
+ 'health': 'sức khỏe',
200
+ 'hospital': 'bệnh viện'
201
+ },
202
+ ('en', 'zh'): {
203
+ 'symptoms': '症状',
204
+ 'pain': '疼痛',
205
+ 'headache': '头痛',
206
+ 'fever': '发烧',
207
+ 'cough': '咳嗽',
208
+ 'treatment': '治疗',
209
+ 'medicine': '药物',
210
+ 'disease': '疾病',
211
+ 'doctor': '医生',
212
+ 'health': '健康',
213
+ 'hospital': '医院'
214
+ }
215
+ }
216
+
217
+ translation_map = translations.get((source_lang, target_lang), {})
218
+
219
+ # Simple word-by-word translation
220
+ translated_query = query
221
+ for source_term, target_term in translation_map.items():
222
+ translated_query = translated_query.replace(source_term, target_term)
223
+
224
+ return translated_query
225
+
226
+ def get_medical_relevance_score(self, text: str, language: str) -> float:
227
+ """Calculate medical relevance score for text in a specific language"""
228
+ if not text:
229
+ return 0.0
230
+
231
+ keywords = self.medical_keywords.get(language, [])
232
+ if not keywords:
233
+ return 0.0
234
+
235
+ text_lower = text.lower()
236
+ matches = sum(1 for keyword in keywords if keyword in text_lower)
237
+
238
+ # Normalize by text length and keyword count
239
+ score = matches / max(len(keywords), 1)
240
+
241
+ # Boost score for longer matches
242
+ if matches > 0:
243
+ score *= (1 + matches * 0.1)
244
+
245
+ return min(score, 1.0)
246
+
247
+ def filter_by_language(self, results: List[Dict], target_language: str) -> List[Dict]:
248
+ """Filter results by language preference"""
249
+ if not results:
250
+ return results
251
+
252
+ # Score results by language match
253
+ scored_results = []
254
+ for result in results:
255
+ result_language = result.get('language', 'en')
256
+ language_score = 1.0 if result_language == target_language else 0.5
257
+
258
+ # Add language score to result
259
+ result_copy = result.copy()
260
+ result_copy['language_score'] = language_score
261
+ scored_results.append(result_copy)
262
+
263
+ # Sort by language score (prefer target language)
264
+ scored_results.sort(key=lambda x: x.get('language_score', 0), reverse=True)
265
+
266
+ return scored_results
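A short, hedged usage sketch of `LanguageProcessor`; the output comments are approximate and deliberately show the limits of the word-by-word keyword translation:

```python
from search.processors.language import LanguageProcessor

lp = LanguageProcessor()

print(lp.detect_language("đau đầu và sốt"))   # typically 'vi' (seeded detector, so deterministic)

print(lp.enhance_query("đau đầu và sốt"))
# -> approximately {'vi': 'đau đầu và sốt y tế', 'en': 'pain đầu và fever'}
#    The shorter key 'đau' is substituted before 'đau đầu', and unmapped words pass
#    through, so the English variant is only a rough keyword translation.

print(lp.get_medical_relevance_score("fever and cough treatment options", "en"))  # small positive score
```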
search/processors/sources.py ADDED
@@ -0,0 +1,352 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple, Set, Any
3
+ import re
4
+ from urllib.parse import urlparse
5
+ from collections import defaultdict
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ class SourceAggregator:
10
+ """Aggregate and process sources for comprehensive information extraction"""
11
+
12
+ def __init__(self):
13
+ # (Removed credibility scoring; keep placeholder map for future use)
14
+ self.source_credibility = {
15
+ # English sources
16
+ 'mayoclinic.org': 0.95,
17
+ 'webmd.com': 0.90,
18
+ 'healthline.com': 0.88,
19
+ 'medlineplus.gov': 0.95,
20
+ 'nih.gov': 0.98,
21
+ 'cdc.gov': 0.98,
22
+ 'who.int': 0.97,
23
+ 'pubmed.ncbi.nlm.nih.gov': 0.96,
24
+ 'uptodate.com': 0.94,
25
+ 'merckmanuals.com': 0.92,
26
+ 'medscape.com': 0.89,
27
+
28
+ # Vietnamese sources
29
+ 'hellobacsi.com': 0.85,
30
+ 'alobacsi.com': 0.82,
31
+ 'vinmec.com': 0.88,
32
+ 'tamanhhospital.vn': 0.85,
33
+ 'medlatec.vn': 0.83,
34
+ 'suckhoedoisong.vn': 0.90,
35
+ 'viendinhduong.vn': 0.87,
36
+
37
+ # Chinese sources
38
+ 'haodf.com': 0.86,
39
+ 'dxy.cn': 0.89,
40
+ 'chunyuyisheng.com': 0.84,
41
+ 'xywy.com': 0.82,
42
+ 'jiankang.com': 0.80,
43
+ 'familydoctor.com.cn': 0.85,
44
+
45
+ # Video platforms
46
+ 'youtube.com': 0.70,
47
+ # 'medscape.com' is already listed above under the English sources
48
+ }
49
+
50
+ # Source type classification
51
+ self.source_types = {
52
+ 'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
53
+ 'hospital': ['mayoclinic.org', 'vinmec.com', 'tamanhhospital.vn'],
54
+ 'commercial': ['webmd.com', 'healthline.com', 'hellobacsi.com'],
55
+ 'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
56
+ 'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
57
+ 'video': ['youtube.com', 'medscape.com']
58
+ }
59
+
60
+ def aggregate_sources(self, search_results: List[Dict], video_results: List[Dict] = None) -> Dict[str, Any]:
61
+ """Aggregate all sources and create comprehensive reference system"""
62
+ all_sources = []
63
+
64
+ # Process search results
65
+ for result in search_results:
66
+ source_info = self._process_source(result)
67
+ if source_info:
68
+ all_sources.append(source_info)
69
+
70
+ # Process video results
71
+ if video_results:
72
+ for video in video_results:
73
+ video_info = self._process_video_source(video)
74
+ if video_info:
75
+ all_sources.append(video_info)
76
+
77
+ # Remove duplicates and score sources
78
+ unique_sources = self._deduplicate_sources(all_sources)
79
+ scored_sources = self._score_sources(unique_sources)
80
+
81
+ # Create comprehensive reference mapping
82
+ reference_mapping = self._create_reference_mapping(scored_sources)
83
+
84
+ # Generate source summary
85
+ source_summary = self._generate_source_summary(scored_sources)
86
+
87
+ return {
88
+ 'sources': scored_sources,
89
+ 'reference_mapping': reference_mapping,
90
+ 'source_summary': source_summary,
91
+ 'total_sources': len(scored_sources),
92
+ 'languages': self._get_language_distribution(scored_sources),
93
+ 'source_types': self._get_source_type_distribution(scored_sources)
94
+ }
95
+
96
+ def _process_source(self, result: Dict) -> Dict:
97
+ """Process a single search result into standardized source format"""
98
+ url = (result or {}).get('url', '')
99
+ if not url:
100
+ return None
101
+
102
+ domain = self._extract_domain(url)
103
+ source_type = self._classify_source_type(domain)
104
+ # Normalize fields with safe defaults
105
+ title = str(result.get('title', '') or '').strip()
106
+ content = str(result.get('content', '') or '')
107
+ language = (result.get('language') or 'en').lower()
108
+ source_name = str(result.get('source', '') or '')
109
+ platform = str(result.get('platform', '') or '')
110
+
111
+ return {
112
+ 'url': url,
113
+ 'title': title,
114
+ 'content': content,
115
+ 'domain': domain,
116
+ 'source_type': source_type,
117
+ 'language': language,
118
+ 'source_name': source_name,
119
+ 'platform': platform,
120
+ 'type': 'text'
121
+ }
122
+
123
+ def _process_video_source(self, video: Dict) -> Dict:
124
+ """Process a video result into standardized source format"""
125
+ url = (video or {}).get('url', '')
126
+ if not url:
127
+ return None
128
+
129
+ domain = self._extract_domain(url)
130
+ source_type = 'video'
131
+ title = str(video.get('title', '') or '').strip()
132
+ language = (video.get('language') or 'en').lower()
133
+ source_name = str(video.get('source', '') or '')
134
+ platform = str(video.get('platform', '') or '')
135
+ return {
136
+ 'url': url,
137
+ 'title': title,
138
+ 'content': '', # Videos don't have text content
139
+ 'domain': domain,
140
+ 'source_type': source_type,
141
+ 'language': language,
142
+ 'source_name': source_name,
143
+ 'platform': platform,
144
+ 'type': 'video'
145
+ }
146
+
147
+ def _extract_domain(self, url: str) -> str:
148
+ """Extract domain from URL"""
149
+ try:
150
+ parsed = urlparse(url)
151
+ domain = parsed.netloc.lower()
152
+ # Remove www. prefix
153
+ if domain.startswith('www.'):
154
+ domain = domain[4:]
155
+ return domain
156
+ except Exception:
157
+ return ''
158
+
159
+ def _classify_source_type(self, domain: str) -> str:
160
+ """Classify source type based on domain"""
161
+ for source_type, domains in self.source_types.items():
162
+ if domain in domains:
163
+ return source_type
164
+ return 'other'
165
+
166
+ def _get_source_credibility(self, domain: str) -> float:
167
+ """Deprecated: credibility scoring removed. Kept for compatibility."""
168
+ return 0.0
169
+
170
+ def _deduplicate_sources(self, sources: List[Dict]) -> List[Dict]:
171
+ """Remove duplicate sources based on URL and title similarity"""
172
+ seen_urls = set()
173
+ seen_titles = set()
174
+ unique_sources = []
175
+
176
+ for source in sources:
177
+ url = source.get('url', '')
178
+ title = source.get('title', '').lower().strip()
179
+
180
+ # Check for URL duplicates
181
+ if url in seen_urls:
182
+ continue
183
+
184
+ # Check for title similarity (fuzzy matching)
185
+ title_similar = any(self._titles_similar(title, seen_title) for seen_title in seen_titles)
186
+ if title_similar:
187
+ continue
188
+
189
+ seen_urls.add(url)
190
+ seen_titles.add(title)
191
+ unique_sources.append(source)
192
+
193
+ return unique_sources
194
+
195
+ def _titles_similar(self, title1: str, title2: str, threshold: float = 0.8) -> bool:
196
+ """Check if two titles are similar (simple word overlap)"""
197
+ if not title1 or not title2:
198
+ return False
199
+
200
+ words1 = set(title1.split())
201
+ words2 = set(title2.split())
202
+
203
+ if not words1 or not words2:
204
+ return False
205
+
206
+ intersection = words1.intersection(words2)
207
+ union = words1.union(words2)
208
+
209
+ similarity = len(intersection) / len(union) if union else 0
210
+ return similarity >= threshold
211
+
212
+ def _score_sources(self, sources: List[Dict]) -> List[Dict]:
213
+ """Score and rank sources by relevance and credibility"""
214
+ for source in sources:
215
+ # Calculate composite score
216
+ content_length = len(source.get('content', ''))
217
+ title_length = len(source.get('title', ''))
218
+
219
+ # Content quality score
220
+ content_score = min(content_length / 1000, 1.0) # Normalize to 0-1
221
+
222
+ # Title quality score
223
+ title_score = min(title_length / 100, 1.0) # Normalize to 0-1
224
+
225
+ # Composite score (weighted)
226
+ composite_score = (
227
+ content_score * 0.6 + # 60% content quality
228
+ title_score * 0.4 # 40% title quality
229
+ )
230
+
231
+ source['composite_score'] = composite_score
232
+
233
+ # Sort by composite score
234
+ sources.sort(key=lambda x: x.get('composite_score', 0), reverse=True)
235
+
236
+ return sources
237
+
238
+ def _create_reference_mapping(self, sources: List[Dict]) -> Dict[int, Dict]:
239
+ """Create reference mapping for citations"""
240
+ reference_mapping = {}
241
+
242
+ for i, source in enumerate(sources, 1):
243
+ reference_mapping[i] = {
244
+ 'url': source['url'],
245
+ 'title': source['title'],
246
+ 'domain': source['domain'],
247
+ 'source_type': source['source_type'],
248
+ 'language': source['language'],
249
+ 'type': source['type']
250
+ }
251
+
252
+ return reference_mapping
253
+
254
+ def _generate_source_summary(self, sources: List[Dict]) -> str:
255
+ """Generate summary of sources used"""
256
+ if not sources:
257
+ return "No sources available."
258
+
259
+ # Group by source type
260
+ type_counts = defaultdict(int)
261
+ language_counts = defaultdict(int)
262
+ # credibility removed
263
+
264
+ for source in sources:
265
+ source_type = source.get('source_type', 'other')
266
+ language = source.get('language', 'en')
267
+ type_counts[source_type] += 1
268
+ language_counts[language] += 1
269
+
270
+ # Generate summary
271
+ summary_parts = []
272
+ summary_parts.append(f"**Sources Used ({len(sources)} total):**")
273
+
274
+ # Source types
275
+ if type_counts:
276
+ type_summary = ", ".join([f"{count} {type_name}" for type_name, count in type_counts.items()])
277
+ summary_parts.append(f"• **Types**: {type_summary}")
278
+
279
+ # Languages
280
+ if language_counts:
281
+ lang_summary = ", ".join([f"{count} {lang}" for lang, count in language_counts.items()])
282
+ summary_parts.append(f"• **Languages**: {lang_summary}")
283
+
284
+ # Credibility
285
+ # credibility info removed
286
+
287
+ return "\n".join(summary_parts)
288
+
289
+ def _get_language_distribution(self, sources: List[Dict]) -> Dict[str, int]:
290
+ """Get distribution of sources by language"""
291
+ distribution = defaultdict(int)
292
+ for source in sources:
293
+ language = source.get('language', 'en')
294
+ distribution[language] += 1
295
+ return dict(distribution)
296
+
297
+ def _get_source_type_distribution(self, sources: List[Dict]) -> Dict[str, int]:
298
+ """Get distribution of sources by type"""
299
+ distribution = defaultdict(int)
300
+ for source in sources:
301
+ source_type = source.get('source_type', 'other')
302
+ distribution[source_type] += 1
303
+ return dict(distribution)
304
+
305
+ def create_comprehensive_references(self, sources: List[Dict], max_references: int = 15) -> str:
306
+ """Create comprehensive reference list for the response"""
307
+ if not sources:
308
+ return ""
309
+
310
+ # Take top sources
311
+ top_sources = sources[:max_references]
312
+
313
+ reference_parts = []
314
+ reference_parts.append("**📚 References:**")
315
+
316
+ for i, source in enumerate(top_sources, 1):
317
+ url = source.get('url', '')
318
+ title = source.get('title', '')
319
+ domain = source.get('domain', '')
320
+ source_type = source.get('source_type', 'other')
321
+ # credibility removed
322
+ language = source.get('language', 'en')
323
+ source_type_icon = source.get('type', 'other')
324
+
325
+ # Create type indicator
326
+ type_icons = {
327
+ 'academic': '🎓',
328
+ 'hospital': '🏥',
329
+ 'government': '🏛️',
330
+ 'commercial': '💼',
331
+ 'professional': '👨‍⚕️',
332
+ 'video': '📹',
333
+ 'other': '📄'
334
+ }
335
+ type_icon = type_icons.get(source_type, '📄')
336
+
337
+ # Create language indicator
338
+ lang_icons = {
339
+ 'en': '🇺🇸',
340
+ 'vi': '🇻🇳',
341
+ 'zh': '🇨🇳'
342
+ }
343
+ lang_icon = lang_icons.get(language, '🌐')
344
+
345
+ reference_line = f"{i}. {type_icon} {lang_icon} [{title}]({url}) - {domain}"
346
+ reference_parts.append(reference_line)
347
+
348
+ if len(sources) > max_references:
349
+ reference_parts.append(f"... and {len(sources) - max_references} more sources")
350
+
351
+ return "\n".join(reference_parts)
352
+
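A brief, hedged sketch of how `SourceAggregator` might be driven; the input dicts and URLs are illustrative, with field names matching what `_process_source` and `_process_video_source` read:

```python
from search.processors.sources import SourceAggregator

aggregator = SourceAggregator()

search_results = [
    {"url": "https://www.mayoclinic.org/headache", "title": "Headache basics",
     "content": "Overview of common headache causes and treatments.", "language": "en"},
]
video_results = [
    {"url": "https://www.youtube.com/watch?v=abc123", "title": "Headache relief tips",
     "language": "en", "platform": "youtube"},
]

aggregated = aggregator.aggregate_sources(search_results, video_results)
print(aggregated["total_sources"])    # 2
print(aggregated["source_types"])     # e.g. {'hospital': 1, 'video': 1}
print(aggregated["source_summary"])   # bullet summary of counts by type and language
print(aggregator.create_comprehensive_references(aggregated["sources"]))  # numbered, icon-prefixed list
```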
search/search.py ADDED
@@ -0,0 +1,362 @@
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ import time
4
+ import hashlib
5
+ from .engines.duckduckgo import DuckDuckGoEngine
6
+ from .engines.video import VideoSearchEngine
7
+ from .coordinator import SearchCoordinator
8
+ # Reranker removed - using simple relevance scoring for cooking content
9
+ from models import summarizer
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Global instances
14
+ _duckduckgo_engine = None
15
+ _video_engine = None
16
+ _reranker = None
17
+ _search_coordinator = None
18
+
19
+ # Simple in-memory cache for search results
20
+ _search_cache = {}
21
+ _cache_ttl = 300 # 5 minutes TTL
22
+
23
+ def get_duckduckgo_engine() -> DuckDuckGoEngine:
24
+ """Get or create the global DuckDuckGo engine instance"""
25
+ global _duckduckgo_engine
26
+ if _duckduckgo_engine is None:
27
+ _duckduckgo_engine = DuckDuckGoEngine()
28
+ return _duckduckgo_engine
29
+
30
+ def get_video_engine() -> VideoSearchEngine:
31
+ """Get or create the global video engine instance"""
32
+ global _video_engine
33
+ if _video_engine is None:
34
+ _video_engine = VideoSearchEngine()
35
+ return _video_engine
36
+
37
+ def get_reranker():
38
+ """Simple cooking relevance scorer - no complex reranking needed"""
39
+ return None
40
+
41
+ def get_search_coordinator() -> SearchCoordinator:
42
+ """Get or create the global search coordinator instance"""
43
+ global _search_coordinator
44
+ if _search_coordinator is None:
45
+ _search_coordinator = SearchCoordinator()
46
+ return _search_coordinator
47
+
48
+ def _clean_search_query(query: str) -> str:
49
+ """Clean search query by removing bullet points and special characters"""
50
+ if not query:
51
+ return ""
52
+
53
+ import re
54
+ # Remove bullet points and special characters
55
+ cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query)
56
+ cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned)
57
+ cleaned = re.sub(r'\s+', ' ', cleaned)
58
+ cleaned = cleaned.strip()
59
+
60
+ # Remove common prefixes that might confuse search
61
+ prefixes_to_remove = [
62
+ r'^(en|vi|zh)\s*:\s*',
63
+ r'^(search|find|look for)\s+',
64
+ r'^(how to|what is|what are)\s+',
65
+ ]
66
+
67
+ for prefix in prefixes_to_remove:
68
+ cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)
69
+
70
+ return cleaned.strip()
71
+
72
+ def _boost_cooking_keywords(query: str) -> str:
73
+ """Add cooking context keywords to improve search relevance"""
74
+ if not query:
75
+ return ""
76
+
77
+ # Cooking keywords that boost relevance
78
+ cooking_boosters = [
79
+ 'recipe', 'cooking', 'culinary', 'technique', 'how to', 'bake', 'roast', 'sear', 'simmer',
80
+ 'ingredients', 'measurements', 'temperature', 'timing', 'substitution', 'variation', 'tips'
81
+ ]
82
+
83
+ query_lower = query.lower()
84
+
85
+ # If query doesn't contain cooking terms, add context
86
+ has_cooking = any(term in query_lower for term in cooking_boosters)
87
+
88
+ if not has_cooking:
89
+ # Add cooking context without being too verbose
90
+ if len(query.split()) <= 3:
91
+ return f"{query} cooking recipe technique"
92
+ else:
93
+ return f"{query} cooking tutorial"
94
+
95
+ return query
96
+
97
+ def _get_cache_key(query: str, num_results: int, target_language: str = None, include_videos: bool = True) -> str:
98
+ """Generate cache key for search results"""
99
+ cache_data = f"{query}_{num_results}_{target_language}_{include_videos}"
100
+ return hashlib.md5(cache_data.encode()).hexdigest()
101
+
102
+ def _get_cached_results(cache_key: str) -> Tuple[str, Dict[int, str], Dict]:
103
+ """Get cached search results if available and not expired"""
104
+ if cache_key not in _search_cache:
105
+ return None, None, None
106
+
107
+ cached_data = _search_cache[cache_key]
108
+ if time.time() - cached_data['timestamp'] > _cache_ttl:
109
+ # Cache expired
110
+ del _search_cache[cache_key]
111
+ return None, None, None
112
+
113
+ logger.info(f"Using cached search results for key: {cache_key[:8]}...")
114
+ return cached_data['search_context'], cached_data['url_mapping'], cached_data['source_aggregation']
115
+
116
+ def _cache_results(cache_key: str, search_context: str, url_mapping: Dict[int, str], source_aggregation: Dict):
117
+ """Cache search results"""
118
+ _search_cache[cache_key] = {
119
+ 'search_context': search_context,
120
+ 'url_mapping': url_mapping,
121
+ 'source_aggregation': source_aggregation,
122
+ 'timestamp': time.time()
123
+ }
124
+ logger.info(f"Cached search results for key: {cache_key[:8]}...")
125
+
126
+ class WebSearcher:
127
+ """Legacy wrapper for backward compatibility"""
128
+ def __init__(self):
129
+ self.coordinator = get_search_coordinator()
130
+ self.max_results = 10
131
+ self.timeout = 10
132
+
133
+ def search_google(self, query: str, num_results: int = 10) -> List[Dict]:
134
+ """Search using the new coordinator system"""
135
+ try:
136
+ cleaned_query = _clean_search_query(query)
137
+ return self.coordinator.quick_search(cleaned_query, num_results)
138
+ except Exception as e:
139
+ logger.error(f"Search failed: {e}")
140
+ return []
141
+
142
+ def search_duckduckgo(self, query: str, num_results: int = 10) -> List[Dict]:
143
+ """Search using DuckDuckGo engine"""
144
+ try:
145
+ cleaned_query = _clean_search_query(query)
146
+ return self.coordinator.quick_search(cleaned_query, num_results)
147
+ except Exception as e:
148
+ logger.error(f"DuckDuckGo search failed: {e}")
149
+ return []
150
+
151
+ def extract_content(self, url: str) -> str:
152
+ """Extract content using the new content extractor"""
153
+ try:
154
+ return self.coordinator.content_extractor.extract(url)
155
+ except Exception as e:
156
+ logger.error(f"Content extraction failed: {e}")
157
+ return ""
158
+
159
+ def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
160
+ """Search and extract content using the new system"""
161
+ try:
162
+ # Clean the query first
163
+ cleaned_query = _clean_search_query(query)
164
+ # Get search results
165
+ results = self.coordinator.quick_search(cleaned_query, num_results)
166
+
167
+ # Extract content for each result
168
+ enriched_results = []
169
+ for result in results:
170
+ content = self.extract_content(result['url'])
171
+ if content:
172
+ enriched_result = result.copy()
173
+ enriched_result['content'] = content
174
+ enriched_results.append(enriched_result)
175
+ return enriched_results
176
+ except Exception as e:
177
+ logger.error(f"Search and extract failed: {e}")
178
+ return []
179
+
180
+ # Main search function for backward compatibility
181
+ def search_web(query: str, num_results: int = 10) -> List[Dict]:
182
+ """Main search function using the new coordinator system"""
183
+ try:
184
+ # Clean the query first
185
+ cleaned_query = _clean_search_query(query)
186
+ coordinator = get_search_coordinator()
187
+ return coordinator.quick_search(cleaned_query, num_results)
188
+ except Exception as e:
189
+ logger.error(f"Web search failed: {e}")
190
+ return []
191
+
192
+ # Enhanced search function with content extraction
193
+ def search_web_with_content(query: str, num_results: int = 10) -> Tuple[str, Dict[int, str]]:
194
+ """Enhanced search with content extraction and summarization"""
195
+ try:
196
+ # Clean the query first
197
+ cleaned_query = _clean_search_query(query)
198
+ coordinator = get_search_coordinator()
199
+ return coordinator.search(cleaned_query, num_results)
200
+ except Exception as e:
201
+ logger.error(f"Enhanced web search failed: {e}")
202
+ return "", {}
203
+
204
+ # Cooking-focused search function
205
+ def search_cooking(query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
206
+ """Cooking-focused search with enhanced processing"""
207
+ try:
208
+ # Clean the query first
209
+ cleaned_query = _clean_search_query(query)
210
+ coordinator = get_search_coordinator()
211
+ return coordinator.cooking_focus_search(cleaned_query, num_results)
212
+ except Exception as e:
213
+ logger.error(f"Cooking search failed: {e}")
214
+ return "", {}
215
+
216
+ # Multilingual cooking search function
217
+ def search_multilingual_cooking(query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
218
+ """Comprehensive multilingual cooking search supporting English, Vietnamese, and Chinese"""
219
+ try:
220
+ # Clean the query first
221
+ cleaned_query = _clean_search_query(query)
222
+ coordinator = get_search_coordinator()
223
+ return coordinator.multilingual_cooking_search(cleaned_query, num_results, target_language)
224
+ except Exception as e:
225
+ logger.error(f"Multilingual cooking search failed: {e}")
226
+ return "", {}
227
+
228
+ # Video search function
229
+ def search_videos(query: str, num_results: int = 2, target_language: str = None) -> List[Dict]:
230
+ """Search for cooking videos across multiple platforms"""
231
+ try:
232
+ # Clean the query first
233
+ cleaned_query = _clean_search_query(query)
234
+ coordinator = get_search_coordinator()
235
+ return coordinator.video_search(cleaned_query, num_results, target_language)
236
+ except Exception as e:
237
+ logger.error(f"Video search failed: {e}")
238
+ return []
239
+
240
+ # Comprehensive search function with maximum information extraction
241
+ def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
242
+ """Comprehensive search with maximum information extraction and detailed references"""
243
+ logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
244
+
245
+ # Check cache first
246
+ cache_key = _get_cache_key(query, num_results, target_language, include_videos)
247
+ cached_context, cached_mapping, cached_aggregation = _get_cached_results(cache_key)
248
+ if cached_context is not None:
249
+ return cached_context, cached_mapping, cached_aggregation
250
+
251
+ # Clean and boost the query for better cooking relevance
252
+ cleaned_query = _clean_search_query(query)
253
+ boosted_query = _boost_cooking_keywords(cleaned_query)
254
+ logger.info(f"Query processing: '{query}' -> '{cleaned_query}' -> '{boosted_query}'")
255
+
256
+ # Get engines
257
+ duckduckgo_engine = get_duckduckgo_engine()
258
+ video_engine = get_video_engine()
259
+ reranker = get_reranker()
260
+
261
+ # Optimized search strategy: get just enough results for good filtering
262
+ # Calculate optimal initial count based on expected filtering ratio
263
+ expected_filter_ratio = 0.4 # Expect to keep ~40% after filtering
264
+ optimal_initial_count = max(num_results * 2, int(num_results / expected_filter_ratio))
265
+
266
+ # Search for text results with optimized count
267
+ text_results = duckduckgo_engine.search(boosted_query, optimal_initial_count)
268
+ logger.info(f"Found {len(text_results)} text results (requested {optimal_initial_count})")
269
+
270
+ # If no text results, try simple fallback search
271
+ if not text_results:
272
+ logger.warning("No text results found, trying simple fallback search")
273
+ try:
274
+ # Try with a very simple query
275
+ simple_query = " ".join(cleaned_query.split()[:3]) # First 3 words only
276
+ text_results = duckduckgo_engine.search(simple_query, num_results)
277
+ logger.info(f"Simple fallback found {len(text_results)} results")
278
+ except Exception as e:
279
+ logger.warning(f"Simple fallback search failed: {e}")
280
+
281
+ # Search for videos if requested (limit to avoid over-fetching)
282
+ video_results = []
283
+ if include_videos:
284
+ try:
285
+ # Map language codes for video search
286
+ lang_mapping = {
287
+ 'EN': 'en',
288
+ 'VI': 'vi',
289
+ 'ZH': 'zh',
290
+ 'en': 'en',
291
+ 'vi': 'vi',
292
+ 'zh': 'zh'
293
+ }
294
+ search_language = lang_mapping.get(target_language, 'en')
295
+ # Limit video results to avoid over-fetching
296
+ max_video_results = min(5, num_results // 3) # Max 5 or 1/3 of total
297
+ video_results = video_engine.search(boosted_query, num_results=max_video_results, language=search_language)
298
+ logger.info(f"Found {len(video_results)} video results")
299
+ except Exception as e:
300
+ logger.warning(f"Video search failed: {e}")
301
+
302
+ # Combine all results
303
+ all_results = text_results + video_results
304
+
305
+ # Simple cooking relevance filtering
306
+ if all_results:
307
+ # Filter by cooking relevance using simple keyword matching
308
+ cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
309
+ relevant_results = []
310
+ for result in all_results:
311
+ title = result.get('title', '').lower()
312
+ content = result.get('content', '').lower()
313
+ if any(keyword in title or keyword in content for keyword in cooking_keywords):
314
+ relevant_results.append(result)
315
+
316
+ if relevant_results:
317
+ all_results = relevant_results
318
+ logger.info(f"Filtered to {len(all_results)} cooking-relevant results")
319
+
320
+ # Limit final results to requested count
321
+ all_results = all_results[:num_results]
322
+
323
+ # Final safety check - ensure we have at least some results
324
+ if not all_results and text_results:
325
+ logger.warning("No results after processing, using original text results as fallback")
326
+ all_results = text_results[:num_results]
327
+
328
+ # Create URL mapping
329
+ url_mapping = {}
330
+ for i, result in enumerate(all_results, 1):
331
+ url_mapping[i] = result.get('url', '')
332
+
333
+ # Create search context using summarizer (only for top results)
334
+ search_context = ""
335
+ if all_results:
336
+ summaries = []
337
+ # Only summarize top results to avoid over-processing
338
+ top_results = all_results[:min(10, len(all_results))]
339
+ for i, result in enumerate(top_results, 1):
340
+ content = result.get('content', '') or result.get('title', '')
341
+ if content:
342
+ # Use query-focused summarization
343
+ summary = summarizer.summarize_for_query(content, boosted_query, max_length=300)
344
+ if summary:
345
+ summaries.append(f"Document {i}: {summary}")
346
+
347
+ search_context = "\n\n".join(summaries)
348
+
349
+ # Create source aggregation
350
+ source_aggregation = {
351
+ 'total_sources': len(all_results),
352
+ 'text_sources': len(text_results),
353
+ 'video_sources': len(video_results),
354
+ 'sources': all_results
355
+ }
356
+
357
+ logger.info(f"Comprehensive search completed: {len(all_results)} total sources")
358
+
359
+ # Cache the results
360
+ _cache_results(cache_key, search_context, url_mapping, source_aggregation)
361
+
362
+ return search_context, url_mapping, source_aggregation
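Since this module is the public entry point, here is a hedged sketch of how it might be called. It performs live DuckDuckGo and video searches through the coordinator and engines wired up elsewhere in the repo, so it needs network access and results will vary:

```python
from search.search import search_comprehensive, search_videos

context, url_mapping, aggregation = search_comprehensive(
    "best way to sear a steak",
    num_results=8,
    target_language="en",
    include_videos=True,
)
print(context[:300])   # query-focused summaries, one "Document N: ..." entry per top source
print(url_mapping)     # {1: 'https://...', 2: 'https://...', ...}
print(aggregation["total_sources"], aggregation["video_sources"])

# Video-only lookup (language-filtered where the platform supports it)
videos = search_videos("how to fold dumplings", num_results=2, target_language="en")
```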
utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Utils package
2
+ from .translation import translate_query
3
+ from .vlm import process_medical_image
4
+ from .diagnosis import retrieve_diagnosis_from_symptoms
utils/migrate.py ADDED
@@ -0,0 +1,54 @@
1
+ # Run this script to move the FAISS index GridFS collections to a separate cluster.
2
+ from pymongo import MongoClient
3
+ from dotenv import load_dotenv
4
+ import os
5
+
6
+ def migrate_faiss_index():
7
+ """Migrate FAISS index from QA cluster to index cluster"""
8
+ # Load environment variables from .env
9
+ load_dotenv()
10
+ # Connection strings (update as needed)
11
+ mongo_uri = os.getenv("MONGO_URI") # QA cluster connection string
12
+ index_uri = os.getenv("INDEX_URI") # FAISS index cluster connection string
13
+
14
+ if not mongo_uri:
15
+ raise ValueError("MONGO_URI is missing!")
16
+ if not index_uri:
17
+ raise ValueError("INDEX_URI is missing!")
18
+
19
+ # Connect to the QA cluster (where FAISS data was accidentally stored)
20
+ qa_client = MongoClient(mongo_uri)
21
+ qa_db = qa_client["MedicalChatbotDB"]
22
+
23
+ # Connect to the FAISS index cluster
24
+ faiss_client = MongoClient(index_uri)
25
+ faiss_db = faiss_client["MedicalChatbotDB"] # Use the same database name if desired
26
+
27
+ # Define the GridFS collections to move.
28
+ # In GridFS, files are stored in two collections: "<bucket>.files" and "<bucket>.chunks".
29
+ source_files = qa_db["faiss_index_files.files"]
30
+ source_chunks = qa_db["faiss_index_files.chunks"]
31
+
32
+ dest_files = faiss_db["faiss_index_files.files"]
33
+ dest_chunks = faiss_db["faiss_index_files.chunks"]
34
+
35
+ print("Moving FAISS index GridFS files...")
36
+
37
+ # Copy documents from the source 'files' collection
38
+ for doc in source_files.find():
39
+ dest_files.insert_one(doc)
40
+
41
+ # Copy documents from the source 'chunks' collection
42
+ for doc in source_chunks.find():
43
+ dest_chunks.insert_one(doc)
44
+
45
+ print("✅ FAISS GridFS collections moved successfully.")
46
+
47
+ # Drop the old collections from the QA cluster to free up space:
48
+ qa_db.drop_collection("faiss_index_files.files")
49
+ qa_db.drop_collection("faiss_index_files.chunks")
50
+ print("Old FAISS GridFS collections dropped from the QA cluster.")
51
+
52
+ # Only run when called directly
53
+ if __name__ == "__main__":
54
+ migrate_faiss_index()
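A hypothetical way to drive the migration script above. The connection strings are placeholders (they would normally live in `.env`), and note that the function drops the source collections once the copy finishes:

```python
import os

# Placeholder connection strings; replace with real values or keep them in .env.
os.environ.setdefault("MONGO_URI", "mongodb+srv://user:pass@qa-cluster.example.net/")
os.environ.setdefault("INDEX_URI", "mongodb+srv://user:pass@index-cluster.example.net/")

from utils.migrate import migrate_faiss_index

# Copies faiss_index_files.files / faiss_index_files.chunks to the index cluster,
# then drops the originals from the QA cluster.
migrate_faiss_index()
```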
utils/symbipredict_2022.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils/translation.py ADDED
@@ -0,0 +1,141 @@
1
+ # translation.py
2
+ from transformers import pipeline
3
+ import logging
4
+ import re
5
+ from collections import Counter
6
+
7
+ logger = logging.getLogger("translation-agent")
8
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
9
+
10
+ # To use lazy model loader
11
+ vi_en = None
12
+ zh_en = None
13
+
14
+ def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
15
+ """Collapse excessive repeated n-grams and repeated phrases with improved logic."""
16
+ if not s:
17
+ return s
18
+
19
+ # Collapse repeated spaces/newlines
20
+ s = re.sub(r"\s+", " ", s).strip()
21
+
22
+ # More aggressive repetition detection
23
+ # Check for simple word repetition (like "a lot of people do not" repeated)
24
+ words = s.split()
25
+ if len(words) > 20: # Only check if text is long enough
26
+ # Look for repeated sequences of 3-8 words
27
+ for seq_len in range(8, 2, -1):
28
+ if len(words) < seq_len * 3: # Need at least 3 repetitions
29
+ continue
30
+
31
+ # Check each possible starting position
32
+ for start in range(len(words) - seq_len * 2):
33
+ sequence = words[start:start + seq_len]
34
+ # Count how many times this sequence repeats
35
+ repeat_count = 1
36
+ pos = start + seq_len
37
+ while pos + seq_len <= len(words):
38
+ if words[pos:pos + seq_len] == sequence:
39
+ repeat_count += 1
40
+ pos += seq_len
41
+ else:
42
+ break
43
+
44
+ # If we found 3+ repetitions, remove the excess
45
+ if repeat_count >= 3:
46
+ # Keep only the first occurrence
47
+ new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:]
48
+ s = " ".join(new_words)
49
+ words = s.split()
50
+ break
51
+ else:
52
+ continue
53
+ break # Break outer loop if we found and fixed a repetition
54
+
55
+ # Additional cleanup for remaining patterns
56
+ # Remove consecutive identical word
57
+ tokens = s.split()
58
+ out = []
59
+ last = None
60
+ for t in tokens:
61
+ if last is None or t.lower() != last.lower():
62
+ out.append(t)
63
+ last = t
64
+ s = " ".join(out)
65
+
66
+ # Limit consecutive duplicate n-grams
67
+ for n in range(n_max, n_min - 1, -1):
68
+ pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
69
+ s = pattern.sub(r"\1", s)
70
+
71
+ return s
72
+
73
+
74
+ def _normalize_and_cap(s: str, cap: int = 512) -> str:
75
+ if not s:
76
+ return s
77
+ s = s.strip()
78
+ if len(s) > cap:
79
+ s = s[:cap]
80
+ return s
81
+
82
+
83
+ def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
84
+ if not s:
85
+ return False
86
+ tokens = [t.lower() for t in s.split()]
87
+ if len(tokens) < 10:
88
+ return False
89
+ counts = Counter(tokens)
90
+ top = counts.most_common(1)[0][1]
91
+ return (top / max(1, len(tokens))) >= threshold
92
+
93
+
94
+ def translate_query(text: str, lang_code: str) -> str:
95
+ global vi_en, zh_en
96
+
97
+ if not text or not text.strip():
98
+ return text
99
+
100
+ try:
101
+ if lang_code == "vi":
102
+ if vi_en is None:
103
+ logger.info("[Translation] Loading Vietnamese-English model...")
104
+ vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
105
+
106
+ # Limit input length to prevent model issues
107
+ input_text = text[:1000] if len(text) > 1000 else text
108
+ raw = vi_en(input_text, max_length=512)[0]["translation_text"]
109
+ cleaned = _dedupe_repeats(raw)
110
+ norm = _normalize_and_cap(cleaned, cap=512)
111
+
112
+ if _is_too_repetitive(norm) or len(norm.strip()) < 10:
113
+ logger.warning("[En-Vi] Translation repetitive or too short; falling back to original text")
114
+ return text
115
+
116
+ logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {norm[:100]}...")
117
+ return norm
118
+
119
+ elif lang_code == "zh":
120
+ if zh_en is None:
121
+ logger.info("[Translation] Loading Chinese-English model...")
122
+ zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
123
+
124
+ # Limit input length to prevent model issues
125
+ input_text = text[:1000] if len(text) > 1000 else text
126
+ raw = zh_en(input_text, max_length=512)[0]["translation_text"]
127
+ cleaned = _dedupe_repeats(raw)
128
+ norm = _normalize_and_cap(cleaned, cap=512)
129
+
130
+ if _is_too_repetitive(norm) or len(norm.strip()) < 10:
131
+ logger.warning("[En-Zh] Translation repetitive or too short; falling back to original text")
132
+ return text
133
+
134
+ logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {norm[:100]}...")
135
+ return norm
136
+
137
+ except Exception as e:
138
+ logger.error(f"[Translation] Translation failed for {lang_code}: {e}")
139
+ return text # Fallback to original text
140
+
141
+ return text
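A quick sketch of the fallback behaviour in `translate_query`. The first Vietnamese or Chinese call downloads the corresponding translation model, so this is illustrative rather than a unit test, and the translated output is approximate:

```python
from utils.translation import translate_query, _dedupe_repeats

# Languages other than 'vi' and 'zh' are returned unchanged.
print(translate_query("hello there", "en"))            # -> "hello there"

# Vietnamese input is translated to English; repetitive or too-short output
# falls back to the original text.
print(translate_query("đau đầu và sốt cao", "vi"))     # e.g. "headache and high fever"

# The repetition guard collapses degenerate model output.
print(_dedupe_repeats("a lot of people do not " * 5))  # -> roughly "a lot of people do not"
```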
utils/vlm.py ADDED
@@ -0,0 +1,54 @@
1
+ import os, logging, traceback, json, base64
2
+ from io import BytesIO
3
+ from PIL import Image
4
+ from .translation import translate_query
5
+ from gradio_client import Client, handle_file
6
+ import tempfile
7
+
8
+ logger = logging.getLogger("vlm-agent")
9
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True)
10
+
11
+ # ✅ Load Gradio client once
12
+ gr_client = None
13
+ def load_gradio_client():
14
+ global gr_client
15
+ if gr_client is None:
16
+ logger.info("[VLM] ⏳ Connecting to MedGEMMA Gradio Space...")
17
+ gr_client = Client("warshanks/medgemma-4b-it")
18
+ logger.info("[VLM] Gradio MedGEMMA client ready.")
19
+ return gr_client
20
+
21
+ def process_medical_image(base64_image: str, prompt: str = None, lang: str = "EN") -> str:
22
+ if not prompt:
23
+ prompt = "Describe and investigate any clinical findings from this medical image."
24
+ elif lang.upper() in {"VI", "ZH"}:
25
+ prompt = translate_query(prompt, lang.lower())
26
+
27
+ try:
28
+ # 1️⃣ Decode base64 image to temp file
29
+ image_data = base64.b64decode(base64_image)
30
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
31
+ tmp.write(image_data)
32
+ tmp.flush()
33
+ image_path = tmp.name
34
+
35
+ # 2️⃣ Send to Gradio MedGEMMA
36
+ client = load_gradio_client()
37
+ logger.info(f"[VLM] Sending prompt: {prompt}")
38
+ result = client.predict(
39
+ message={"text": prompt, "files": [handle_file(image_path)]},
40
+ param_2 = "You analyze medical images and report abnormalities, diseases with clear diagnostic insight.",
41
+ param_3=2048,
42
+ api_name="/chat"
43
+ )
44
+ if isinstance(result, str):
45
+ logger.info(f"[VLM] ✅ Response: {result}")
46
+ return result.strip()
47
+ else:
48
+ logger.warning(f"[VLM] ⚠️ Unexpected result type: {type(result)} — {result}")
49
+ return str(result)
50
+
51
+ except Exception as e:
52
+ logger.error(f"[VLM] ❌ Exception: {e}")
53
+ logger.error(f"[VLM] 🔍 Traceback:\n{traceback.format_exc()}")
54
+ return f"[VLM] ⚠️ Failed to process image: {e}"