LiamKhoaLe committed on
Commit
3685b45
·
1 Parent(s): 4bc06b1

Update cooking-specific services. Update multilingual processors.

Browse files
api/chatbot.py CHANGED
@@ -7,7 +7,6 @@ from .config import gemini_flash_api_key
7
  from memory import MemoryManager
8
  from utils import translate_query
9
  from search import search_comprehensive
10
- # Safety guard removed - cooking tutor doesn't need medical safety checks
11
 
12
  logger = logging.getLogger("cooking-tutor")
13
 
@@ -66,9 +65,8 @@ class CookingTutorChatbot:
66
  cuisine: str = None,
67
  structured: bool = False,
68
  ) -> str:
69
- # Translate to English-centric search if needed
70
- if lang.upper() in {"VI", "ZH"}:
71
- user_query = translate_query(user_query, lang.lower())
72
 
73
  # Basic cooking relevance check
74
  cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
@@ -88,11 +86,13 @@ class CookingTutorChatbot:
88
 
89
  if search_mode:
90
  try:
 
91
  search_context, url_mapping, source_aggregation = search_comprehensive(
92
- f"cooking technique tutorial: {user_query}",
93
  num_results=12,
94
  target_language=lang,
95
- include_videos=bool(video_mode)
 
96
  )
97
  if video_mode and source_aggregation:
98
  video_results = source_aggregation.get('sources', []) or []
@@ -170,11 +170,24 @@ class CookingTutorChatbot:
170
  if user_id:
171
  self.memory.add_exchange(user_id, user_query, response, lang=lang)
172
 
 
 
 
 
 
 
173
  if video_mode and video_results:
174
- return {
175
- 'text': response.strip(),
176
- 'videos': video_results
177
- }
 
 
 
 
 
 
 
178
  return response.strip()
179
 
180
  def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
 
7
  from memory import MemoryManager
8
  from utils import translate_query
9
  from search import search_comprehensive
 
10
 
11
  logger = logging.getLogger("cooking-tutor")
12
 
 
65
  cuisine: str = None,
66
  structured: bool = False,
67
  ) -> str:
68
+ # Keep original language for native search - no translation needed
69
+ # The search engines now support native language sources
 
70
 
71
  # Basic cooking relevance check
72
  cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
 
86
 
87
  if search_mode:
88
  try:
89
+ # Use native language search for better results
90
  search_context, url_mapping, source_aggregation = search_comprehensive(
91
+ user_query, # Use original query without English prefix
92
  num_results=12,
93
  target_language=lang,
94
+ include_videos=bool(video_mode),
95
+ include_images=True # Always include images for visual appeal
96
  )
97
  if video_mode and source_aggregation:
98
  video_results = source_aggregation.get('sources', []) or []
 
170
  if user_id:
171
  self.memory.add_exchange(user_id, user_query, response, lang=lang)
172
 
173
+ # Prepare response with media
174
+ response_data = {
175
+ 'text': response.strip()
176
+ }
177
+
178
+ # Add videos if available
179
  if video_mode and video_results:
180
+ response_data['videos'] = video_results
181
+
182
+ # Add images if available
183
+ if source_aggregation and 'images' in source_aggregation:
184
+ images = source_aggregation['images']
185
+ if images:
186
+ response_data['images'] = images[:3] # Limit to 3 images
187
+
188
+ # Return structured response if we have media, otherwise just text
189
+ if len(response_data) > 1:
190
+ return response_data
191
  return response.strip()
192
 
193
  def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
api/routes.py CHANGED
@@ -59,13 +59,15 @@ async def chat_endpoint(req: Request):
59
  )
60
  elapsed = time.time() - start
61
 
62
- # Handle response format (might be string or dict with videos)
63
  if isinstance(answer, dict):
64
  response_text = answer.get('text', '')
65
  video_data = answer.get('videos', [])
 
66
  else:
67
  response_text = answer
68
  video_data = []
 
69
 
70
  # Final response
71
  response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
@@ -74,6 +76,10 @@ async def chat_endpoint(req: Request):
74
  if video_data:
75
  response_data["videos"] = video_data
76
 
 
 
 
 
77
  return JSONResponse(response_data)
78
 
79
  except Exception as e:
 
59
  )
60
  elapsed = time.time() - start
61
 
62
+ # Handle response format (might be string or dict with videos/images)
63
  if isinstance(answer, dict):
64
  response_text = answer.get('text', '')
65
  video_data = answer.get('videos', [])
66
+ image_data = answer.get('images', [])
67
  else:
68
  response_text = answer
69
  video_data = []
70
+ image_data = []
71
 
72
  # Final response
73
  response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
 
76
  if video_data:
77
  response_data["videos"] = video_data
78
 
79
+ # Include image data if available
80
+ if image_data:
81
+ response_data["images"] = image_data
82
+
83
  return JSONResponse(response_data)
84
 
85
  except Exception as e:
search/coordinator.py CHANGED
@@ -7,6 +7,7 @@ from .engines.duckduckgo import DuckDuckGoEngine
7
  from .engines.cooking import CookingSearchEngine
8
  from .engines.multilingual import MultilingualCookingEngine
9
  from .engines.video import VideoSearchEngine
 
10
  from .extractors.content import ContentExtractor
11
  from .processors.cooking import CookingSearchProcessor
12
  from .processors.language import LanguageProcessor
@@ -27,6 +28,7 @@ class SearchCoordinator:
27
  self.cooking_engine = CookingSearchEngine()
28
  self.multilingual_engine = MultilingualCookingEngine()
29
  self.video_engine = VideoSearchEngine()
 
30
 
31
  # Initialize processors
32
  self.content_extractor = ContentExtractor()
@@ -105,7 +107,7 @@ class SearchCoordinator:
105
  return summary, url_mapping
106
 
107
  def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
108
- """Search using multilingual medical engine"""
109
  try:
110
  if language:
111
  results = self.multilingual_engine.search_by_language(query, language, num_results)
@@ -454,6 +456,31 @@ class SearchCoordinator:
454
 
455
  logger.info(f"Video search completed: {len(video_results)} videos found")
456
  return video_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
459
  """Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
 
7
  from .engines.cooking import CookingSearchEngine
8
  from .engines.multilingual import MultilingualCookingEngine
9
  from .engines.video import VideoSearchEngine
10
+ from .engines.image import ImageSearchEngine
11
  from .extractors.content import ContentExtractor
12
  from .processors.cooking import CookingSearchProcessor
13
  from .processors.language import LanguageProcessor
 
28
  self.cooking_engine = CookingSearchEngine()
29
  self.multilingual_engine = MultilingualCookingEngine()
30
  self.video_engine = VideoSearchEngine()
31
+ self.image_engine = ImageSearchEngine()
32
 
33
  # Initialize processors
34
  self.content_extractor = ContentExtractor()
 
107
  return summary, url_mapping
108
 
109
  def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
110
+ """Search using multilingual cooking engine"""
111
  try:
112
  if language:
113
  results = self.multilingual_engine.search_by_language(query, language, num_results)
 
456
 
457
  logger.info(f"Video search completed: {len(video_results)} videos found")
458
  return video_results
459
+
460
+ def image_search(self, query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
461
+ """Search for cooking-related images"""
462
+ logger.info(f"Image search for: {query} (target: {target_language})")
463
+
464
+ # Detect language if not provided
465
+ if not target_language:
466
+ target_language = self.language_processor.detect_language(query)
467
+
468
+ # Map language codes
469
+ lang_mapping = {
470
+ 'EN': 'en',
471
+ 'VI': 'vi',
472
+ 'ZH': 'zh',
473
+ 'en': 'en',
474
+ 'vi': 'vi',
475
+ 'zh': 'zh'
476
+ }
477
+ search_language = lang_mapping.get(target_language, 'en')
478
+
479
+ # Search for images
480
+ image_results = self.image_engine.search_cooking_images(query, num_results, search_language)
481
+
482
+ logger.info(f"Image search completed: {len(image_results)} images found")
483
+ return image_results
484
 
485
  def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
486
  """Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
search/engines/duckduckgo.py CHANGED
@@ -143,11 +143,11 @@ class DuckDuckGoEngine:
143
  return ' '.join(words[:3]) # Max 3 words
144
 
145
  def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
146
- """Filter out irrelevant sources like generic health pages, quizzes, etc."""
147
  import re
148
  filtered = []
149
 
150
- # Only exclude obvious non-medical content
151
  exclude_patterns = [
152
  r'/quiz$', # Quiz pages (end of URL)
153
  r'/test$', # Test pages (end of URL)
@@ -325,7 +325,7 @@ class DuckDuckGoEngine:
325
  'format': 'json',
326
  'no_html': '1',
327
  'skip_disambig': '1',
328
- 't': 'MedicalChatbot'
329
  }
330
 
331
  response = self.session.get(url, params=params, timeout=self.timeout)
 
143
  return ' '.join(words[:3]) # Max 3 words
144
 
145
  def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
146
+ """Filter out irrelevant sources like generic pages, quizzes, etc."""
147
  import re
148
  filtered = []
149
 
150
+ # Only exclude obvious non-cooking content
151
  exclude_patterns = [
152
  r'/quiz$', # Quiz pages (end of URL)
153
  r'/test$', # Test pages (end of URL)
 
325
  'format': 'json',
326
  'no_html': '1',
327
  'skip_disambig': '1',
328
+ 't': 'CookingTutor'
329
  }
330
 
331
  response = self.session.get(url, params=params, timeout=self.timeout)
search/engines/image.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import logging
4
+ from typing import List, Dict
5
+ import time
6
+ import re
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class ImageSearchEngine:
11
+ """Search engine for cooking-related images"""
12
+
13
+ def __init__(self, timeout: int = 15):
14
+ self.session = requests.Session()
15
+ self.session.headers.update({
16
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
17
+ })
18
+ self.timeout = timeout
19
+
20
+ def search_cooking_images(self, query: str, num_results: int = 3, language: str = "en") -> List[Dict]:
21
+ """Search for cooking-related images"""
22
+ results = []
23
+
24
+ # Try multiple image search strategies
25
+ strategies = [
26
+ self._search_google_images,
27
+ self._search_bing_images,
28
+ self._search_unsplash
29
+ ]
30
+
31
+ for strategy in strategies:
32
+ try:
33
+ strategy_results = strategy(query, num_results, language)
34
+ if strategy_results:
35
+ results.extend(strategy_results)
36
+ logger.info(f"Image search found {len(strategy_results)} results")
37
+ if len(results) >= num_results:
38
+ break
39
+ except Exception as e:
40
+ logger.warning(f"Image search strategy failed: {e}")
41
+ continue
42
+
43
+ return results[:num_results]
44
+
45
+ def _search_google_images(self, query: str, num_results: int, language: str) -> List[Dict]:
46
+ """Search Google Images for cooking content"""
47
+ try:
48
+ # Add cooking context to improve relevance
49
+ cooking_query = f"{query} recipe cooking food dish"
50
+
51
+ url = "https://www.google.com/search"
52
+ params = {
53
+ 'q': cooking_query,
54
+ 'tbm': 'isch', # Image search
55
+ 'hl': language,
56
+ 'safe': 'active'
57
+ }
58
+
59
+ response = self.session.get(url, params=params, timeout=self.timeout)
60
+ response.raise_for_status()
61
+
62
+ soup = BeautifulSoup(response.content, 'html.parser')
63
+ results = []
64
+
65
+ # Find image containers
66
+ image_containers = soup.find_all('div', class_='islrc')
67
+
68
+ for container in image_containers[:num_results]:
69
+ try:
70
+ # Extract image URL
71
+ img_tag = container.find('img')
72
+ if not img_tag:
73
+ continue
74
+
75
+ img_url = img_tag.get('src') or img_tag.get('data-src')
76
+ if not img_url or not img_url.startswith('http'):
77
+ continue
78
+
79
+ # Extract title/alt text
80
+ title = img_tag.get('alt', '') or img_tag.get('title', '')
81
+
82
+ # Extract source URL
83
+ link_tag = container.find('a')
84
+ source_url = link_tag.get('href', '') if link_tag else ''
85
+
86
+ results.append({
87
+ 'url': img_url,
88
+ 'title': title,
89
+ 'source_url': source_url,
90
+ 'source': 'google_images',
91
+ 'type': 'image'
92
+ })
93
+
94
+ except Exception as e:
95
+ logger.debug(f"Error parsing Google image: {e}")
96
+ continue
97
+
98
+ return results
99
+
100
+ except Exception as e:
101
+ logger.warning(f"Google Images search failed: {e}")
102
+ return []
103
+
104
+ def _search_bing_images(self, query: str, num_results: int, language: str) -> List[Dict]:
105
+ """Search Bing Images for cooking content"""
106
+ try:
107
+ cooking_query = f"{query} recipe cooking food"
108
+
109
+ url = "https://www.bing.com/images/search"
110
+ params = {
111
+ 'q': cooking_query,
112
+ 'qft': '+filterui:imagesize-large', # Large images
113
+ 'form': 'HDRSC2',
114
+ 'first': '1',
115
+ 'count': num_results
116
+ }
117
+
118
+ response = self.session.get(url, params=params, timeout=self.timeout)
119
+ response.raise_for_status()
120
+
121
+ soup = BeautifulSoup(response.content, 'html.parser')
122
+ results = []
123
+
124
+ # Find image containers
125
+ image_containers = soup.find_all('div', class_='img_cont')
126
+
127
+ for container in image_containers[:num_results]:
128
+ try:
129
+ img_tag = container.find('img')
130
+ if not img_tag:
131
+ continue
132
+
133
+ img_url = img_tag.get('src') or img_tag.get('data-src')
134
+ if not img_url or not img_url.startswith('http'):
135
+ continue
136
+
137
+ title = img_tag.get('alt', '') or img_tag.get('title', '')
138
+
139
+ results.append({
140
+ 'url': img_url,
141
+ 'title': title,
142
+ 'source_url': '',
143
+ 'source': 'bing_images',
144
+ 'type': 'image'
145
+ })
146
+
147
+ except Exception as e:
148
+ logger.debug(f"Error parsing Bing image: {e}")
149
+ continue
150
+
151
+ return results
152
+
153
+ except Exception as e:
154
+ logger.warning(f"Bing Images search failed: {e}")
155
+ return []
156
+
157
+ def _search_unsplash(self, query: str, num_results: int, language: str) -> List[Dict]:
158
+ """Search Unsplash for high-quality cooking images"""
159
+ try:
160
+ cooking_query = f"{query} food cooking recipe"
161
+
162
+ url = "https://unsplash.com/s/photos/" + cooking_query.replace(' ', '-')
163
+
164
+ response = self.session.get(url, timeout=self.timeout)
165
+ response.raise_for_status()
166
+
167
+ soup = BeautifulSoup(response.content, 'html.parser')
168
+ results = []
169
+
170
+ # Find image containers
171
+ image_containers = soup.find_all('figure')
172
+
173
+ for container in image_containers[:num_results]:
174
+ try:
175
+ img_tag = container.find('img')
176
+ if not img_tag:
177
+ continue
178
+
179
+ img_url = img_tag.get('src') or img_tag.get('data-src')
180
+ if not img_url or not img_url.startswith('http'):
181
+ continue
182
+
183
+ title = img_tag.get('alt', '') or img_tag.get('title', '')
184
+
185
+ # Get source URL
186
+ link_tag = container.find('a')
187
+ source_url = link_tag.get('href', '') if link_tag else ''
188
+ if source_url and not source_url.startswith('http'):
189
+ source_url = 'https://unsplash.com' + source_url
190
+
191
+ results.append({
192
+ 'url': img_url,
193
+ 'title': title,
194
+ 'source_url': source_url,
195
+ 'source': 'unsplash',
196
+ 'type': 'image'
197
+ })
198
+
199
+ except Exception as e:
200
+ logger.debug(f"Error parsing Unsplash image: {e}")
201
+ continue
202
+
203
+ return results
204
+
205
+ except Exception as e:
206
+ logger.warning(f"Unsplash search failed: {e}")
207
+ return []
208
+
209
+ def _filter_cooking_relevance(self, images: List[Dict], query: str) -> List[Dict]:
210
+ """Filter images for cooking relevance"""
211
+ cooking_keywords = [
212
+ 'food', 'cooking', 'recipe', 'dish', 'meal', 'ingredient', 'kitchen',
213
+ 'chef', 'bake', 'cook', 'preparation', 'cuisine', 'delicious', 'tasty'
214
+ ]
215
+
216
+ relevant_images = []
217
+ query_lower = query.lower()
218
+
219
+ for image in images:
220
+ title = image.get('title', '').lower()
221
+
222
+ # Check if title contains cooking keywords or query terms
223
+ is_relevant = (
224
+ any(keyword in title for keyword in cooking_keywords) or
225
+ any(word in title for word in query_lower.split() if len(word) > 3)
226
+ )
227
+
228
+ if is_relevant:
229
+ relevant_images.append(image)
230
+
231
+ return relevant_images
search/engines/video.py CHANGED
@@ -50,13 +50,6 @@ class VideoSearchEngine:
50
  'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
51
  'base_url': 'https://www.youtube.com'
52
  },
53
- {
54
- 'name': 'vinmec_videos',
55
- 'search_url': 'https://www.vinmec.com/vi/tim-kiem',
56
- 'params': {'q': ''},
57
- 'selectors': ['a[href*="/video/"]', 'a[href*="/suc-khoe/"]'],
58
- 'base_url': 'https://www.vinmec.com'
59
- }
60
  ],
61
  'zh': [
62
  {
@@ -87,8 +80,8 @@ class VideoSearchEngine:
87
  q = re.sub(r"\s+", " ", q)
88
  return q.strip()
89
 
90
- def _is_valid_medical_video(self, result: Dict, query: str) -> bool:
91
- """Check if video is medically relevant and has valid URL"""
92
  url = result.get('url', '')
93
  title = result.get('title', '')
94
 
@@ -96,25 +89,23 @@ class VideoSearchEngine:
96
  if 'results?search_query=' in url:
97
  return False
98
 
99
- # Skip non-YouTube URLs that aren't medical platforms
100
- if 'youtube.com' not in url and not any(med in url for med in ['medscape.com', 'vinmec.com', 'haodf.com']):
101
  return False
102
 
103
- # Check if title contains medical keywords or query terms
104
  title_lower = title.lower()
105
  query_lower = query.lower()
106
 
107
- medical_keywords = [
108
- 'medical', 'health', 'doctor', 'treatment', 'diagnosis',
109
- 'symptoms', 'therapy', 'medicine', 'clinical', 'patient',
110
- 'disease', 'condition', 'healthcare', 'physician'
111
  ]
112
 
113
- # Must contain medical keywords or query terms
114
- has_medical = any(keyword in title_lower for keyword in medical_keywords)
115
  has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
116
 
117
- return has_medical or has_query
118
 
119
  def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
120
  """Search platform with retry logic and better error handling"""
@@ -130,9 +121,9 @@ class VideoSearchEngine:
130
  return []
131
 
132
  def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
133
- """Search for medical videos across platforms with deduplication and medical filtering"""
134
  query = self._normalize_query(query)
135
- logger.info(f"Searching for medical videos: {query} (language: {language})")
136
 
137
  results = []
138
  seen_urls = set() # Track URLs to avoid duplicates
@@ -152,7 +143,7 @@ class VideoSearchEngine:
152
  logger.warning(f"No results from {platform['name']}")
153
  continue
154
 
155
- # Filter out duplicates and non-medical content
156
  for result in platform_results:
157
  url = result.get('url', '')
158
  video_id = self._extract_video_id(url)
@@ -161,8 +152,8 @@ class VideoSearchEngine:
161
  if url in seen_urls or (video_id and video_id in seen_video_ids):
162
  continue
163
 
164
- # Check if it's a valid medical video (less strict for more results)
165
- if self._is_valid_medical_video(result, query):
166
  seen_urls.add(url)
167
  if video_id:
168
  seen_video_ids.add(video_id)
@@ -192,7 +183,7 @@ class VideoSearchEngine:
192
 
193
  if (url not in seen_urls and
194
  video_id not in seen_video_ids and
195
- self._is_valid_medical_video(result, query)):
196
  seen_urls.add(url)
197
  if video_id:
198
  seen_video_ids.add(video_id)
@@ -373,8 +364,8 @@ class VideoSearchEngine:
373
  fallback_videos = {
374
  'en': [
375
  {
376
- 'url': 'https://www.youtube.com/results?search_query=medical+' + quote(query),
377
- 'title': f'Medical Videos: {query}',
378
  'platform': 'youtube_fallback',
379
  'type': 'video',
380
  'source': 'youtube'
 
50
  'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
51
  'base_url': 'https://www.youtube.com'
52
  },
 
 
 
 
 
 
 
53
  ],
54
  'zh': [
55
  {
 
80
  q = re.sub(r"\s+", " ", q)
81
  return q.strip()
82
 
83
+ def _is_valid_cooking_video(self, result: Dict, query: str) -> bool:
84
+ """Check if video is cooking-relevant and has valid URL"""
85
  url = result.get('url', '')
86
  title = result.get('title', '')
87
 
 
89
  if 'results?search_query=' in url:
90
  return False
91
 
92
+ # Skip non-YouTube URLs that aren't cooking platforms
93
+ if 'youtube.com' not in url and not any(cook in url for cook in ['allrecipes.com', 'foodnetwork.com', 'epicurious.com', 'seriouseats.com']):
94
  return False
95
 
96
+ # Check if title contains cooking keywords or query terms
97
  title_lower = title.lower()
98
  query_lower = query.lower()
99
 
100
+ cooking_keywords = [
101
+ 'recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner'
 
 
102
  ]
103
 
104
+ # Must contain cooking keywords or query terms
105
+ has_cooking = any(keyword in title_lower for keyword in cooking_keywords)
106
  has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
107
 
108
+ return has_cooking or has_query
109
 
110
  def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
111
  """Search platform with retry logic and better error handling"""
 
121
  return []
122
 
123
  def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
124
+ """Search for cooking videos across platforms with deduplication and cooking filtering"""
125
  query = self._normalize_query(query)
126
+ logger.info(f"Searching for cooking videos: {query} (language: {language})")
127
 
128
  results = []
129
  seen_urls = set() # Track URLs to avoid duplicates
 
143
  logger.warning(f"No results from {platform['name']}")
144
  continue
145
 
146
+ # Filter out duplicates and non-cooking content
147
  for result in platform_results:
148
  url = result.get('url', '')
149
  video_id = self._extract_video_id(url)
 
152
  if url in seen_urls or (video_id and video_id in seen_video_ids):
153
  continue
154
 
155
+ # Check if it's a valid cooking video (less strict for more results)
156
+ if self._is_valid_cooking_video(result, query):
157
  seen_urls.add(url)
158
  if video_id:
159
  seen_video_ids.add(video_id)
 
183
 
184
  if (url not in seen_urls and
185
  video_id not in seen_video_ids and
186
+ self._is_valid_cooking_video(result, query)):
187
  seen_urls.add(url)
188
  if video_id:
189
  seen_video_ids.add(video_id)
 
364
  fallback_videos = {
365
  'en': [
366
  {
367
+ 'url': 'https://www.youtube.com/results?search_query=cooking+' + quote(query),
368
+ 'title': f'Cooking Videos: {query}',
369
  'platform': 'youtube_fallback',
370
  'type': 'video',
371
  'source': 'youtube'
search/processors/cooking.py CHANGED
@@ -30,7 +30,7 @@ class CookingSearchProcessor:
30
  ],
31
  'dietary': [
32
  'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
33
- 'healthy', 'low-carb', 'low-fat', 'protein', 'fiber'
34
  ],
35
  'meal_types': [
36
  'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
@@ -272,12 +272,12 @@ class CookingSearchProcessor:
272
 
273
  # Add topic header
274
  topic_headers = {
275
- 'recipes': "**🍳 Recipes and Instructions:**",
276
- 'techniques': "**👨‍🍳 Cooking Techniques:**",
277
- 'ingredients': "**🥘 Ingredients and Substitutions:**",
278
- 'equipment': "**🔪 Equipment and Tools:**",
279
- 'tips_tricks': "**💡 Tips and Tricks:**",
280
- 'general': "**📚 General Information:**"
281
  }
282
 
283
  header = topic_headers.get(topic, "**Information:**")
 
30
  ],
31
  'dietary': [
32
  'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
33
+ 'healthy', 'low-carb', 'low-fat', 'protein', 'fiber', 'nutritious', 'balanced'
34
  ],
35
  'meal_types': [
36
  'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
 
272
 
273
  # Add topic header
274
  topic_headers = {
275
+ 'recipes': "**Recipes and Instructions:**",
276
+ 'techniques': "**Cooking Techniques:**",
277
+ 'ingredients': "**Ingredients and Substitutions:**",
278
+ 'equipment': "**Equipment and Tools:**",
279
+ 'tips_tricks': "**Tips and Tricks:**",
280
+ 'general': "**General Information:**"
281
  }
282
 
283
  header = topic_headers.get(topic, "**Information:**")
search/processors/enhanced.py CHANGED
@@ -92,7 +92,7 @@ class EnhancedContentProcessor:
92
  return comprehensive_summary, reference_mapping
93
 
94
  def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
95
- """Extract structured information by medical categories"""
96
  structured_info = defaultdict(list)
97
 
98
  for source in sources:
@@ -100,8 +100,8 @@ class EnhancedContentProcessor:
100
  if not content:
101
  continue
102
 
103
- # Extract information by medical categories
104
- for category, patterns in self.medical_patterns.items():
105
  extracted_info = self._extract_category_info(content, patterns, category, user_query)
106
  if extracted_info:
107
  structured_info[category].append({
 
92
  return comprehensive_summary, reference_mapping
93
 
94
  def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
95
+ """Extract structured information by cooking categories"""
96
  structured_info = defaultdict(list)
97
 
98
  for source in sources:
 
100
  if not content:
101
  continue
102
 
103
+ # Extract information by cooking categories
104
+ for category, patterns in self.cooking_patterns.items():
105
  extracted_info = self._extract_category_info(content, patterns, category, user_query)
106
  if extracted_info:
107
  structured_info[category].append({
search/processors/language.py CHANGED
@@ -10,57 +10,59 @@ logger = logging.getLogger(__name__)
10
  DetectorFactory.seed = 0
11
 
12
  class LanguageProcessor:
13
- """Process and enhance queries for multilingual medical search"""
14
 
15
  def __init__(self):
16
- # Medical keywords in different languages
17
- self.medical_keywords = {
18
  'en': [
19
- 'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
20
- 'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
21
- 'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
22
- 'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
23
- 'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
24
- 'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
25
- 'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
26
- 'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
27
- 'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
28
  ],
29
  'vi': [
30
- 'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
31
- 'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
32
- 'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác ', 'y tế',
33
- 'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
34
- 'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
35
- 'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
36
- 'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
37
- 'em ', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
38
- 'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
39
  ],
40
  'zh': [
41
- '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
42
- '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
43
- '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
44
- '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
45
- '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
46
- '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
47
- '手术', '医院', '诊所'
 
 
48
  ]
49
  }
50
 
51
  # Language-specific search enhancements
52
  self.language_enhancements = {
53
  'vi': {
54
- 'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
55
- 'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác ']
56
  },
57
  'zh': {
58
- 'common_terms': ['是什么', '原因', '治疗方法', '症状'],
59
- 'medical_context': ['医疗', '健康', '医院', '医生']
60
  },
61
  'en': {
62
- 'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
63
- 'medical_context': ['medical', 'health', 'hospital', 'doctor']
64
  }
65
  }
66
 
@@ -136,16 +138,16 @@ class LanguageProcessor:
136
  """Enhance query for a specific language"""
137
  enhancements = self.language_enhancements.get(language, {})
138
  common_terms = enhancements.get('common_terms', [])
139
- medical_context = enhancements.get('medical_context', [])
140
 
141
- # Check if query already contains medical context
142
  query_lower = query.lower()
143
- has_medical_context = any(term in query_lower for term in medical_context)
144
 
145
- # If no medical context, add it
146
- if not has_medical_context and medical_context:
147
- # Add the most relevant medical context term
148
- query += f" {medical_context[0]}"
149
 
150
  # Check if query is a question and add relevant terms
151
  if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
@@ -155,62 +157,58 @@ class LanguageProcessor:
155
  return query.strip()
156
 
157
  def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
158
- """Simple keyword-based translation for medical terms"""
159
  # This is a basic implementation - in production, you'd use a proper translation service
160
 
161
- # Medical term translations
162
  translations = {
163
  ('vi', 'en'): {
164
- 'triệu chứng': 'symptoms',
165
- 'đau': 'pain',
166
- 'đau đầu': 'headache',
167
- 'sốt': 'fever',
168
- 'ho': 'cough',
169
- 'điều trị': 'treatment',
170
- 'thuốc': 'medicine',
171
- 'bệnh': 'disease',
172
- 'bác sĩ': 'doctor',
173
- 'sức khỏe': 'health',
174
- 'bệnh viện': 'hospital'
175
  },
176
  ('zh', 'en'): {
177
- '症状': 'symptoms',
178
- '疼痛': 'pain',
179
- '头痛': 'headache',
180
- '发烧': 'fever',
181
- '咳嗽': 'cough',
182
- '治疗': 'treatment',
183
- '药物': 'medicine',
184
- '疾病': 'disease',
185
- '医生': 'doctor',
186
- '健康': 'health',
187
- '医院': 'hospital'
188
  },
189
  ('en', 'vi'): {
190
- 'symptoms': 'triệu chứng',
191
- 'pain': 'đau',
192
- 'headache': 'đau đầu',
193
- 'fever': 'sốt',
194
- 'cough': 'ho',
195
- 'treatment': 'điều trị',
196
- 'medicine': 'thuốc',
197
- 'disease': 'bệnh',
198
- 'doctor': 'bác sĩ',
199
- 'health': 'sức khỏe',
200
- 'hospital': 'bệnh viện'
201
  },
202
  ('en', 'zh'): {
203
- 'symptoms': '症状',
204
- 'pain': '疼痛',
205
- 'headache': '头痛',
206
- 'fever': '发烧',
207
- 'cough': '咳嗽',
208
- 'treatment': '治疗',
209
- 'medicine': '药物',
210
- 'disease': '疾病',
211
- 'doctor': '医生',
212
- 'health': '健康',
213
- 'hospital': '医院'
214
  }
215
  }
216
 
@@ -223,12 +221,12 @@ class LanguageProcessor:
223
 
224
  return translated_query
225
 
226
- def get_medical_relevance_score(self, text: str, language: str) -> float:
227
- """Calculate medical relevance score for text in a specific language"""
228
  if not text:
229
  return 0.0
230
 
231
- keywords = self.medical_keywords.get(language, [])
232
  if not keywords:
233
  return 0.0
234
 
 
10
  DetectorFactory.seed = 0
11
 
12
  class LanguageProcessor:
13
+ """Process and enhance queries for multilingual cooking search"""
14
 
15
  def __init__(self):
16
+ # Cooking keywords in different languages
17
+ self.cooking_keywords = {
18
  'en': [
19
+ 'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
20
+ 'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
21
+ 'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
22
+ 'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
23
+ 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
24
+ 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
25
+ 'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai',
26
+ 'substitution', 'alternative', 'variation', 'modification', 'adaptation',
27
+ 'troubleshooting', 'tips', 'tricks', 'hacks', 'mistakes', 'common errors'
28
  ],
29
  'vi': [
30
+ 'công thức', 'nấu ăn', 'nướng', 'rang', 'nướng vỉ', 'chiên', 'luộc', 'hấp',
31
+ 'nguyên liệu', 'gia vị', 'thảo mộc', 'nước sốt', 'tẩm ướp', 'dressing',
32
+ 'kỹ thuật', 'phương pháp', 'nhiệt độ', 'thời gian', 'chuẩn bị', 'thời gian nấu',
33
+ 'lò nướng', 'bếp', 'vỉ nướng', 'chảo', 'nồi', 'dao', 'cắt',
34
+ 'chay', 'thuần chay', 'không gluten', 'không sữa', 'keto', 'paleo',
35
+ 'khai vị', 'món chính', 'tráng miệng', 'sáng', 'trưa', 'tối',
36
+ 'ẩm thực', 'ý', 'trung', 'mexico', 'pháp', 'ấn', 'thái',
37
+ 'thay thế', 'biến tấu', 'sửa đổi', 'thích ứng',
38
+ 'khắc phục', 'mẹo', 'thủ thuật', 'lỗi thường gặp'
39
  ],
40
  'zh': [
41
+ '食谱', '烹饪', '烘焙', '烤', '烧烤', '煎', '煮', '蒸',
42
+ '食材', '调料', '香料', '香草', '酱汁', '腌料', '调味汁',
43
+ '技巧', '方法', '温度', '时间', '准备', '烹饪时间',
44
+ '烤箱', '炉灶', '烤架', '平底锅', '锅', '刀', '切',
45
+ '素食', '纯素', '无麸质', '无乳制品', '生酮', '古法',
46
+ '开胃菜', '主菜', '甜点', '早餐', '午餐', '晚餐',
47
+ '菜系', '意大利', '中国', '墨西哥', '法国', '印度', '泰国',
48
+ '替代', '变化', '修改', '适应',
49
+ '故障排除', '技巧', '窍门', '常见错误'
50
  ]
51
  }
52
 
53
  # Language-specific search enhancements
54
  self.language_enhancements = {
55
  'vi': {
56
+ 'common_terms': ['là gì', 'cách nấu', 'công thức', 'nguyên liệu'],
57
+ 'cooking_context': ['nấu ăn', 'ẩm thực', 'bếp', 'đầu bếp']
58
  },
59
  'zh': {
60
+ 'common_terms': ['是什么', '怎么做', '食谱', '食材'],
61
+ 'cooking_context': ['烹饪', '美食', '厨房', '厨师']
62
  },
63
  'en': {
64
+ 'common_terms': ['what is', 'how to cook', 'recipe', 'ingredients'],
65
+ 'cooking_context': ['cooking', 'culinary', 'kitchen', 'chef']
66
  }
67
  }
68
 
 
138
  """Enhance query for a specific language"""
139
  enhancements = self.language_enhancements.get(language, {})
140
  common_terms = enhancements.get('common_terms', [])
141
+ cooking_context = enhancements.get('cooking_context', [])
142
 
143
+ # Check if query already contains cooking context
144
  query_lower = query.lower()
145
+ has_cooking_context = any(term in query_lower for term in cooking_context)
146
 
147
+ # If no cooking context, add it
148
+ if not has_cooking_context and cooking_context:
149
+ # Add the most relevant cooking context term
150
+ query += f" {cooking_context[0]}"
151
 
152
  # Check if query is a question and add relevant terms
153
  if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
 
157
  return query.strip()
158
 
159
  def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
160
+ """Simple keyword-based translation for cooking terms"""
161
  # This is a basic implementation - in production, you'd use a proper translation service
162
 
163
+ # Cooking term translations
164
  translations = {
165
  ('vi', 'en'): {
166
+ 'công thức': 'recipe',
167
+ 'nấu ăn': 'cooking',
168
+ 'nguyên liệu': 'ingredients',
169
+ 'gia vị': 'seasoning',
170
+ 'kỹ thuật': 'technique',
171
+ 'nướng': 'baking',
172
+ 'chiên': 'frying',
173
+ 'luộc': 'boiling',
174
+ 'hấp': 'steaming',
175
+ 'nước sốt': 'sauce'
 
176
  },
177
  ('zh', 'en'): {
178
+ '食谱': 'recipe',
179
+ '烹饪': 'cooking',
180
+ '食材': 'ingredients',
181
+ '调料': 'seasoning',
182
+ '技巧': 'technique',
183
+ '烘焙': 'baking',
184
+ '煎': 'frying',
185
+ '煮': 'boiling',
186
+ '蒸': 'steaming',
187
+ '酱汁': 'sauce'
 
188
  },
189
  ('en', 'vi'): {
190
+ 'recipe': 'công thức',
191
+ 'cooking': 'nấu ăn',
192
+ 'ingredients': 'nguyên liệu',
193
+ 'seasoning': 'gia vị',
194
+ 'technique': 'kỹ thuật',
195
+ 'baking': 'nướng',
196
+ 'frying': 'chiên',
197
+ 'boiling': 'luộc',
198
+ 'steaming': 'hấp',
199
+ 'sauce': 'nước sốt'
 
200
  },
201
  ('en', 'zh'): {
202
+ 'recipe': '食谱',
203
+ 'cooking': '烹饪',
204
+ 'ingredients': '食材',
205
+ 'seasoning': '调料',
206
+ 'technique': '技巧',
207
+ 'baking': '烘焙',
208
+ 'frying': '煎',
209
+ 'boiling': '煮',
210
+ 'steaming': '蒸',
211
+ 'sauce': '酱汁'
 
212
  }
213
  }
214
 
 
221
 
222
  return translated_query
223
 
224
+ def get_cooking_relevance_score(self, text: str, language: str) -> float:
225
+ """Calculate cooking relevance score for text in a specific language"""
226
  if not text:
227
  return 0.0
228
 
229
+ keywords = self.cooking_keywords.get(language, [])
230
  if not keywords:
231
  return 0.0
232
 
search/processors/sources.py CHANGED
@@ -13,9 +13,9 @@ class SourceAggregator:
13
  # (Removed credibility scoring; keep placeholder map for future use)
14
  self.source_credibility = {
15
  # English sources
16
- 'mayoclinic.org': 0.95,
17
- 'webmd.com': 0.90,
18
- 'healthline.com': 0.88,
19
  'medlineplus.gov': 0.95,
20
  'nih.gov': 0.98,
21
  'cdc.gov': 0.98,
@@ -29,7 +29,7 @@ class SourceAggregator:
29
  'hellobacsi.com': 0.85,
30
  'alobacsi.com': 0.82,
31
  'vinmec.com': 0.88,
32
- 'tamanhhospital.vn': 0.85,
33
  'medlatec.vn': 0.83,
34
  'suckhoedoisong.vn': 0.90,
35
  'viendinhduong.vn': 0.87,
@@ -40,7 +40,7 @@ class SourceAggregator:
40
  'chunyuyisheng.com': 0.84,
41
  'xywy.com': 0.82,
42
  'jiankang.com': 0.80,
43
- 'familydoctor.com.cn': 0.85,
44
 
45
  # Video platforms
46
  'youtube.com': 0.70,
@@ -50,8 +50,8 @@ class SourceAggregator:
50
  # Source type classification
51
  self.source_types = {
52
  'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
53
- 'hospital': ['mayoclinic.org', 'vinmec.com', 'tamanhhospital.vn'],
54
- 'commercial': ['webmd.com', 'healthline.com', 'hellobacsi.com'],
55
  'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
56
  'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
57
  'video': ['youtube.com', 'medscape.com']
@@ -325,7 +325,7 @@ class SourceAggregator:
325
  # Create type indicator
326
  type_icons = {
327
  'academic': '🎓',
328
- 'hospital': '🏥',
329
  'government': '🏛️',
330
  'commercial': '💼',
331
  'professional': '👨‍⚕️',
 
13
  # (Removed credibility scoring; keep placeholder map for future use)
14
  self.source_credibility = {
15
  # English sources
16
+ 'allrecipes.com': 0.95,
17
+ 'foodnetwork.com': 0.90,
18
+ 'epicurious.com': 0.88,
19
  'medlineplus.gov': 0.95,
20
  'nih.gov': 0.98,
21
  'cdc.gov': 0.98,
 
29
  'hellobacsi.com': 0.85,
30
  'alobacsi.com': 0.82,
31
  'vinmec.com': 0.88,
32
+ 'monngonviet.com': 0.85,
33
  'medlatec.vn': 0.83,
34
  'suckhoedoisong.vn': 0.90,
35
  'viendinhduong.vn': 0.87,
 
40
  'chunyuyisheng.com': 0.84,
41
  'xywy.com': 0.82,
42
  'jiankang.com': 0.80,
43
+ 'xiachufang.com': 0.85,
44
 
45
  # Video platforms
46
  'youtube.com': 0.70,
 
50
  # Source type classification
51
  self.source_types = {
52
  'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
53
+ 'cooking_sites': ['allrecipes.com', 'foodnetwork.com', 'epicurious.com'],
54
+ 'commercial': ['seriouseats.com', 'bonappetit.com', 'tasteofhome.com'],
55
  'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
56
  'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
57
  'video': ['youtube.com', 'medscape.com']
 
325
  # Create type indicator
326
  type_icons = {
327
  'academic': '🎓',
328
+ 'cooking_sites': '🍳',
329
  'government': '🏛️',
330
  'commercial': '💼',
331
  'professional': '👨‍⚕️',
search/search.py CHANGED
@@ -4,6 +4,7 @@ import time
4
  import hashlib
5
  from .engines.duckduckgo import DuckDuckGoEngine
6
  from .engines.video import VideoSearchEngine
 
7
  from .coordinator import SearchCoordinator
8
  # Reranker removed - using simple relevance scoring for cooking content
9
  from models import summarizer
@@ -13,6 +14,7 @@ logger = logging.getLogger(__name__)
13
  # Global instances
14
  _duckduckgo_engine = None
15
  _video_engine = None
 
16
  _reranker = None
17
  _search_coordinator = None
18
 
@@ -34,6 +36,13 @@ def get_video_engine() -> VideoSearchEngine:
34
  _video_engine = VideoSearchEngine()
35
  return _video_engine
36
 
 
 
 
 
 
 
 
37
  def get_reranker():
38
  """Simple cooking relevance scorer - no complex reranking needed"""
39
  return None
@@ -237,8 +246,20 @@ def search_videos(query: str, num_results: int = 2, target_language: str = None)
237
  logger.error(f"Video search failed: {e}")
238
  return []
239
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  # Comprehensive search function with maximum information extraction
241
- def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
242
  """Comprehensive search with maximum information extraction and detailed references"""
243
  logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
244
 
@@ -299,8 +320,20 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
299
  except Exception as e:
300
  logger.warning(f"Video search failed: {e}")
301
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  # Combine all results
303
- all_results = text_results + video_results
304
 
305
  # Simple cooking relevance filtering
306
  if all_results:
@@ -351,7 +384,10 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
351
  'total_sources': len(all_results),
352
  'text_sources': len(text_results),
353
  'video_sources': len(video_results),
354
- 'sources': all_results
 
 
 
355
  }
356
 
357
  logger.info(f"Comprehensive search completed: {len(all_results)} total sources")
 
4
  import hashlib
5
  from .engines.duckduckgo import DuckDuckGoEngine
6
  from .engines.video import VideoSearchEngine
7
+ from .engines.image import ImageSearchEngine
8
  from .coordinator import SearchCoordinator
9
  # Reranker removed - using simple relevance scoring for cooking content
10
  from models import summarizer
 
14
  # Global instances
15
  _duckduckgo_engine = None
16
  _video_engine = None
17
+ _image_engine = None
18
  _reranker = None
19
  _search_coordinator = None
20
 
 
36
  _video_engine = VideoSearchEngine()
37
  return _video_engine
38
 
39
+ def get_image_engine() -> ImageSearchEngine:
40
+ """Get or create the global image engine instance"""
41
+ global _image_engine
42
+ if _image_engine is None:
43
+ _image_engine = ImageSearchEngine()
44
+ return _image_engine
45
+
46
  def get_reranker():
47
  """Simple cooking relevance scorer - no complex reranking needed"""
48
  return None
 
246
  logger.error(f"Video search failed: {e}")
247
  return []
248
 
249
+ # Image search function
250
+ def search_images(query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
251
+ """Search for cooking-related images"""
252
+ try:
253
+ # Clean the query first
254
+ cleaned_query = _clean_search_query(query)
255
+ coordinator = get_search_coordinator()
256
+ return coordinator.image_search(cleaned_query, num_results, target_language)
257
+ except Exception as e:
258
+ logger.error(f"Image search failed: {e}")
259
+ return []
260
+
261
  # Comprehensive search function with maximum information extraction
262
+ def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True, include_images: bool = True) -> Tuple[str, Dict[int, str], Dict]:
263
  """Comprehensive search with maximum information extraction and detailed references"""
264
  logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
265
 
 
320
  except Exception as e:
321
  logger.warning(f"Video search failed: {e}")
322
 
323
+ # Search for images if requested
324
+ image_results = []
325
+ if include_images:
326
+ try:
327
+ image_engine = get_image_engine()
328
+ # Limit image results to avoid over-fetching
329
+ max_image_results = min(3, num_results // 5) # Max 3 or 1/5 of total
330
+ image_results = image_engine.search_cooking_images(boosted_query, max_image_results, search_language)
331
+ logger.info(f"Found {len(image_results)} image results")
332
+ except Exception as e:
333
+ logger.warning(f"Image search failed: {e}")
334
+
335
  # Combine all results
336
+ all_results = text_results + video_results + image_results
337
 
338
  # Simple cooking relevance filtering
339
  if all_results:
 
384
  'total_sources': len(all_results),
385
  'text_sources': len(text_results),
386
  'video_sources': len(video_results),
387
+ 'image_sources': len(image_results),
388
+ 'sources': all_results,
389
+ 'videos': video_results,
390
+ 'images': image_results
391
  }
392
 
393
  logger.info(f"Comprehensive search completed: {len(all_results)} total sources")