Spaces:
Sleeping
Sleeping
Commit
·
3685b45
1
Parent(s):
4bc06b1
Update cooking-specific services. Update multilingual processors
Browse files
- api/chatbot.py +23 -10
- api/routes.py +7 -1
- search/coordinator.py +28 -1
- search/engines/duckduckgo.py +3 -3
- search/engines/image.py +231 -0
- search/engines/video.py +18 -27
- search/processors/cooking.py +7 -7
- search/processors/enhanced.py +3 -3
- search/processors/language.py +88 -90
- search/processors/sources.py +8 -8
- search/search.py +39 -3
api/chatbot.py
CHANGED
|
@@ -7,7 +7,6 @@ from .config import gemini_flash_api_key
|
|
| 7 |
from memory import MemoryManager
|
| 8 |
from utils import translate_query
|
| 9 |
from search import search_comprehensive
|
| 10 |
-
# Safety guard removed - cooking tutor doesn't need medical safety checks
|
| 11 |
|
| 12 |
logger = logging.getLogger("cooking-tutor")
|
| 13 |
|
|
@@ -66,9 +65,8 @@ class CookingTutorChatbot:
|
|
| 66 |
cuisine: str = None,
|
| 67 |
structured: bool = False,
|
| 68 |
) -> str:
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
user_query = translate_query(user_query, lang.lower())
|
| 72 |
|
| 73 |
# Basic cooking relevance check
|
| 74 |
cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
|
|
@@ -88,11 +86,13 @@ class CookingTutorChatbot:
|
|
| 88 |
|
| 89 |
if search_mode:
|
| 90 |
try:
|
|
|
|
| 91 |
search_context, url_mapping, source_aggregation = search_comprehensive(
|
| 92 |
-
|
| 93 |
num_results=12,
|
| 94 |
target_language=lang,
|
| 95 |
-
include_videos=bool(video_mode)
|
|
|
|
| 96 |
)
|
| 97 |
if video_mode and source_aggregation:
|
| 98 |
video_results = source_aggregation.get('sources', []) or []
|
|
@@ -170,11 +170,24 @@ class CookingTutorChatbot:
|
|
| 170 |
if user_id:
|
| 171 |
self.memory.add_exchange(user_id, user_query, response, lang=lang)
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
if video_mode and video_results:
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
return response.strip()
|
| 179 |
|
| 180 |
def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
|
|
|
|
| 7 |
from memory import MemoryManager
|
| 8 |
from utils import translate_query
|
| 9 |
from search import search_comprehensive
|
|
|
|
| 10 |
|
| 11 |
logger = logging.getLogger("cooking-tutor")
|
| 12 |
|
|
|
|
| 65 |
cuisine: str = None,
|
| 66 |
structured: bool = False,
|
| 67 |
) -> str:
|
| 68 |
+
# Keep original language for native search - no translation needed
|
| 69 |
+
# The search engines now support native language sources
|
|
|
|
| 70 |
|
| 71 |
# Basic cooking relevance check
|
| 72 |
cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
|
|
|
|
| 86 |
|
| 87 |
if search_mode:
|
| 88 |
try:
|
| 89 |
+
# Use native language search for better results
|
| 90 |
search_context, url_mapping, source_aggregation = search_comprehensive(
|
| 91 |
+
user_query, # Use original query without English prefix
|
| 92 |
num_results=12,
|
| 93 |
target_language=lang,
|
| 94 |
+
include_videos=bool(video_mode),
|
| 95 |
+
include_images=True # Always include images for visual appeal
|
| 96 |
)
|
| 97 |
if video_mode and source_aggregation:
|
| 98 |
video_results = source_aggregation.get('sources', []) or []
|
|
|
|
| 170 |
if user_id:
|
| 171 |
self.memory.add_exchange(user_id, user_query, response, lang=lang)
|
| 172 |
|
| 173 |
+
# Prepare response with media
|
| 174 |
+
response_data = {
|
| 175 |
+
'text': response.strip()
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
# Add videos if available
|
| 179 |
if video_mode and video_results:
|
| 180 |
+
response_data['videos'] = video_results
|
| 181 |
+
|
| 182 |
+
# Add images if available
|
| 183 |
+
if source_aggregation and 'images' in source_aggregation:
|
| 184 |
+
images = source_aggregation['images']
|
| 185 |
+
if images:
|
| 186 |
+
response_data['images'] = images[:3] # Limit to 3 images
|
| 187 |
+
|
| 188 |
+
# Return structured response if we have media, otherwise just text
|
| 189 |
+
if len(response_data) > 1:
|
| 190 |
+
return response_data
|
| 191 |
return response.strip()
|
| 192 |
|
| 193 |
def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
|
api/routes.py
CHANGED
|
@@ -59,13 +59,15 @@ async def chat_endpoint(req: Request):
|
|
| 59 |
)
|
| 60 |
elapsed = time.time() - start
|
| 61 |
|
| 62 |
-
# Handle response format (might be string or dict with videos)
|
| 63 |
if isinstance(answer, dict):
|
| 64 |
response_text = answer.get('text', '')
|
| 65 |
video_data = answer.get('videos', [])
|
|
|
|
| 66 |
else:
|
| 67 |
response_text = answer
|
| 68 |
video_data = []
|
|
|
|
| 69 |
|
| 70 |
# Final response
|
| 71 |
response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
|
|
@@ -74,6 +76,10 @@ async def chat_endpoint(req: Request):
|
|
| 74 |
if video_data:
|
| 75 |
response_data["videos"] = video_data
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
return JSONResponse(response_data)
|
| 78 |
|
| 79 |
except Exception as e:
|
|
|
|
| 59 |
)
|
| 60 |
elapsed = time.time() - start
|
| 61 |
|
| 62 |
+
# Handle response format (might be string or dict with videos/images)
|
| 63 |
if isinstance(answer, dict):
|
| 64 |
response_text = answer.get('text', '')
|
| 65 |
video_data = answer.get('videos', [])
|
| 66 |
+
image_data = answer.get('images', [])
|
| 67 |
else:
|
| 68 |
response_text = answer
|
| 69 |
video_data = []
|
| 70 |
+
image_data = []
|
| 71 |
|
| 72 |
# Final response
|
| 73 |
response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
|
|
|
|
| 76 |
if video_data:
|
| 77 |
response_data["videos"] = video_data
|
| 78 |
|
| 79 |
+
# Include image data if available
|
| 80 |
+
if image_data:
|
| 81 |
+
response_data["images"] = image_data
|
| 82 |
+
|
| 83 |
return JSONResponse(response_data)
|
| 84 |
|
| 85 |
except Exception as e:
|
search/coordinator.py
CHANGED
|
@@ -7,6 +7,7 @@ from .engines.duckduckgo import DuckDuckGoEngine
|
|
| 7 |
from .engines.cooking import CookingSearchEngine
|
| 8 |
from .engines.multilingual import MultilingualCookingEngine
|
| 9 |
from .engines.video import VideoSearchEngine
|
|
|
|
| 10 |
from .extractors.content import ContentExtractor
|
| 11 |
from .processors.cooking import CookingSearchProcessor
|
| 12 |
from .processors.language import LanguageProcessor
|
|
@@ -27,6 +28,7 @@ class SearchCoordinator:
|
|
| 27 |
self.cooking_engine = CookingSearchEngine()
|
| 28 |
self.multilingual_engine = MultilingualCookingEngine()
|
| 29 |
self.video_engine = VideoSearchEngine()
|
|
|
|
| 30 |
|
| 31 |
# Initialize processors
|
| 32 |
self.content_extractor = ContentExtractor()
|
|
@@ -105,7 +107,7 @@ class SearchCoordinator:
|
|
| 105 |
return summary, url_mapping
|
| 106 |
|
| 107 |
def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
|
| 108 |
-
"""Search using multilingual
|
| 109 |
try:
|
| 110 |
if language:
|
| 111 |
results = self.multilingual_engine.search_by_language(query, language, num_results)
|
|
@@ -454,6 +456,31 @@ class SearchCoordinator:
|
|
| 454 |
|
| 455 |
logger.info(f"Video search completed: {len(video_results)} videos found")
|
| 456 |
return video_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
|
| 459 |
"""Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
|
|
|
|
| 7 |
from .engines.cooking import CookingSearchEngine
|
| 8 |
from .engines.multilingual import MultilingualCookingEngine
|
| 9 |
from .engines.video import VideoSearchEngine
|
| 10 |
+
from .engines.image import ImageSearchEngine
|
| 11 |
from .extractors.content import ContentExtractor
|
| 12 |
from .processors.cooking import CookingSearchProcessor
|
| 13 |
from .processors.language import LanguageProcessor
|
|
|
|
| 28 |
self.cooking_engine = CookingSearchEngine()
|
| 29 |
self.multilingual_engine = MultilingualCookingEngine()
|
| 30 |
self.video_engine = VideoSearchEngine()
|
| 31 |
+
self.image_engine = ImageSearchEngine()
|
| 32 |
|
| 33 |
# Initialize processors
|
| 34 |
self.content_extractor = ContentExtractor()
|
|
|
|
| 107 |
return summary, url_mapping
|
| 108 |
|
| 109 |
def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
|
| 110 |
+
"""Search using multilingual cooking engine"""
|
| 111 |
try:
|
| 112 |
if language:
|
| 113 |
results = self.multilingual_engine.search_by_language(query, language, num_results)
|
|
|
|
| 456 |
|
| 457 |
logger.info(f"Video search completed: {len(video_results)} videos found")
|
| 458 |
return video_results
|
| 459 |
+
|
| 460 |
+
def image_search(self, query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
    """Search for cooking-related images matching ``query``.

    Args:
        query: The user's search text, passed to the image engine unchanged.
        num_results: Maximum number of images requested from the engine.
        target_language: Optional language code. When falsy, the language is
            obtained from ``self.language_processor.detect_language(query)``.

    Returns:
        Whatever ``self.image_engine.search_cooking_images`` returns for the
        query, the requested count and the resolved language code.
    """
    logger.info(f"Image search for: {query} (target: {target_language})")

    # Detect language if not provided
    if not target_language:
        target_language = self.language_processor.detect_language(query)

    # Reduce the code to its lowercase primary subtag so that "EN", "Vi",
    # "zh-cn" or "zh_TW" all resolve to a supported language instead of
    # silently falling back to English. Anything unsupported still maps to 'en'.
    supported_languages = ('en', 'vi', 'zh')
    primary_subtag = str(target_language or '').strip().lower().replace('_', '-').split('-')[0]
    search_language = primary_subtag if primary_subtag in supported_languages else 'en'

    # Search for images
    image_results = self.image_engine.search_cooking_images(query, num_results, search_language)

    logger.info(f"Image search completed: {len(image_results)} images found")
    return image_results
|
| 484 |
|
| 485 |
def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
|
| 486 |
"""Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
|
search/engines/duckduckgo.py
CHANGED
|
@@ -143,11 +143,11 @@ class DuckDuckGoEngine:
|
|
| 143 |
return ' '.join(words[:3]) # Max 3 words
|
| 144 |
|
| 145 |
def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
|
| 146 |
-
"""Filter out irrelevant sources like generic
|
| 147 |
import re
|
| 148 |
filtered = []
|
| 149 |
|
| 150 |
-
# Only exclude obvious non-
|
| 151 |
exclude_patterns = [
|
| 152 |
r'/quiz$', # Quiz pages (end of URL)
|
| 153 |
r'/test$', # Test pages (end of URL)
|
|
@@ -325,7 +325,7 @@ class DuckDuckGoEngine:
|
|
| 325 |
'format': 'json',
|
| 326 |
'no_html': '1',
|
| 327 |
'skip_disambig': '1',
|
| 328 |
-
't': '
|
| 329 |
}
|
| 330 |
|
| 331 |
response = self.session.get(url, params=params, timeout=self.timeout)
|
|
|
|
| 143 |
return ' '.join(words[:3]) # Max 3 words
|
| 144 |
|
| 145 |
def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
|
| 146 |
+
"""Filter out irrelevant sources like generic pages, quizzes, etc."""
|
| 147 |
import re
|
| 148 |
filtered = []
|
| 149 |
|
| 150 |
+
# Only exclude obvious non-cooking content
|
| 151 |
exclude_patterns = [
|
| 152 |
r'/quiz$', # Quiz pages (end of URL)
|
| 153 |
r'/test$', # Test pages (end of URL)
|
|
|
|
| 325 |
'format': 'json',
|
| 326 |
'no_html': '1',
|
| 327 |
'skip_disambig': '1',
|
| 328 |
+
't': 'CookingTutor'
|
| 329 |
}
|
| 330 |
|
| 331 |
response = self.session.get(url, params=params, timeout=self.timeout)
|
search/engines/image.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import time
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
class ImageSearchEngine:
    """Search engine for cooking-related images.

    Results are gathered by fetching public search pages (Google Images,
    Bing Images, Unsplash) with a shared ``requests`` session and parsing the
    returned HTML with BeautifulSoup. Every result is a dict with the keys
    ``url``, ``title``, ``source_url``, ``source`` and ``type``.
    """

    # Attributes inspected, in priority order, when looking for an image URL.
    _IMAGE_URL_ATTRS = ('src', 'data-src')

    def __init__(self, timeout: int = 15):
        """Create the HTTP session used by all strategies.

        Args:
            timeout: Per-request timeout in seconds passed to ``session.get``.
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        self.timeout = timeout

    def search_cooking_images(self, query: str, num_results: int = 3, language: str = "en") -> List[Dict]:
        """Search for cooking-related images.

        Strategies are tried in order (Google, Bing, Unsplash) until at least
        ``num_results`` distinct images have been collected. A strategy that
        raises is logged and skipped.

        Args:
            query: Search text.
            num_results: Maximum number of images to return. Values <= 0
                return an empty list without issuing any request.
            language: Language code forwarded to each strategy.

        Returns:
            At most ``num_results`` image dicts, without duplicate URLs.
        """
        if num_results <= 0:
            return []

        results: List[Dict] = []
        seen_urls = set()

        # Try multiple image search strategies
        strategies = [
            self._search_google_images,
            self._search_bing_images,
            self._search_unsplash
        ]

        for strategy in strategies:
            try:
                strategy_results = strategy(query, num_results, language)
            except Exception as e:
                logger.warning(f"Image search strategy failed: {e}")
                continue

            if not strategy_results:
                continue

            logger.info(f"Image search found {len(strategy_results)} results")
            self._merge_unique(results, seen_urls, strategy_results)
            if len(results) >= num_results:
                break

        return results[:num_results]

    @staticmethod
    def _merge_unique(collected: List[Dict], seen_urls: set, candidates: List[Dict]) -> None:
        """Append candidates whose ``url`` is new; mutates ``collected`` and ``seen_urls``."""
        for image in candidates:
            url = image.get('url')
            if url in seen_urls:
                continue
            seen_urls.add(url)
            collected.append(image)

    @staticmethod
    def _pick_image_url(img_tag) -> str:
        """Return the first absolute http(s) URL among the tag's image attributes.

        ``src`` is checked before ``data-src``. A ``src`` that is not an
        http(s) URL (for example an inline ``data:`` URI or a relative path)
        no longer hides a usable ``data-src``. Returns ``''`` when no
        attribute holds an http(s) URL.
        """
        for attr in ImageSearchEngine._IMAGE_URL_ATTRS:
            candidate = img_tag.get(attr)
            if isinstance(candidate, str) and candidate.startswith('http'):
                return candidate
        return ''

    @staticmethod
    def _unsplash_search_url(query: str) -> str:
        """Build the Unsplash search URL for ``query`` with cooking context appended."""
        from urllib.parse import quote

        slug = f"{query} food cooking recipe".replace(' ', '-')
        # Percent-encode the slug so '/', '?', '#' etc. in the query cannot
        # change the structure of the request path.
        return "https://unsplash.com/s/photos/" + quote(slug, safe='')

    def _collect_images(self, containers, num_results: int, source: str, label: str,
                        include_link: bool = True, link_base: str = '') -> List[Dict]:
        """Turn parsed HTML containers into image result dicts.

        Args:
            containers: Elements that each may hold an ``<img>`` (and ``<a>``).
            num_results: Stop once this many valid images were collected.
            source: Value stored under the ``source`` key of each result.
            label: Provider name used in the debug log message.
            include_link: When true, the first ``<a>`` href becomes ``source_url``.
            link_base: Prefix added to a non-http ``source_url`` (empty = none).

        Returns:
            Up to ``num_results`` image dicts, in document order.
        """
        results: List[Dict] = []
        for container in containers:
            # Count only valid images against the quota, so unusable
            # thumbnails do not crowd out later usable ones.
            if len(results) >= num_results:
                break
            try:
                img_tag = container.find('img')
                if not img_tag:
                    continue

                img_url = self._pick_image_url(img_tag)
                if not img_url:
                    continue

                # Extract title/alt text
                title = img_tag.get('alt', '') or img_tag.get('title', '')

                # Extract source URL
                source_url = ''
                if include_link:
                    link_tag = container.find('a')
                    source_url = link_tag.get('href', '') if link_tag else ''
                    if source_url and link_base and not source_url.startswith('http'):
                        source_url = link_base + source_url

                results.append({
                    'url': img_url,
                    'title': title,
                    'source_url': source_url,
                    'source': source,
                    'type': 'image'
                })
            except Exception as e:
                logger.debug(f"Error parsing {label} image: {e}")
                continue
        return results

    def _search_google_images(self, query: str, num_results: int, language: str) -> List[Dict]:
        """Search Google Images for cooking content; returns ``[]`` on any failure."""
        try:
            # Add cooking context to improve relevance
            cooking_query = f"{query} recipe cooking food dish"

            url = "https://www.google.com/search"
            params = {
                'q': cooking_query,
                'tbm': 'isch',  # Image search
                'hl': language,
                'safe': 'active'
            }

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_containers = soup.find_all('div', class_='islrc')
            return self._collect_images(image_containers, num_results, 'google_images', 'Google')

        except Exception as e:
            logger.warning(f"Google Images search failed: {e}")
            return []

    def _search_bing_images(self, query: str, num_results: int, language: str) -> List[Dict]:
        """Search Bing Images for cooking content; returns ``[]`` on any failure.

        ``language`` is accepted for signature parity with the other
        strategies but is not sent to Bing.
        """
        try:
            cooking_query = f"{query} recipe cooking food"

            url = "https://www.bing.com/images/search"
            params = {
                'q': cooking_query,
                'qft': '+filterui:imagesize-large',  # Large images
                'form': 'HDRSC2',
                'first': '1',
                'count': num_results
            }

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_containers = soup.find_all('div', class_='img_cont')
            # Bing results carry no source link here; source_url stays ''.
            return self._collect_images(image_containers, num_results, 'bing_images', 'Bing',
                                        include_link=False)

        except Exception as e:
            logger.warning(f"Bing Images search failed: {e}")
            return []

    def _search_unsplash(self, query: str, num_results: int, language: str) -> List[Dict]:
        """Search Unsplash for cooking images; returns ``[]`` on any failure.

        ``language`` is accepted for signature parity with the other
        strategies but is not used in the request.
        """
        try:
            url = self._unsplash_search_url(query)

            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            image_containers = soup.find_all('figure')
            # Relative photo links are made absolute against unsplash.com.
            return self._collect_images(image_containers, num_results, 'unsplash', 'Unsplash',
                                        link_base='https://unsplash.com')

        except Exception as e:
            logger.warning(f"Unsplash search failed: {e}")
            return []

    def _filter_cooking_relevance(self, images: List[Dict], query: str) -> List[Dict]:
        """Keep images whose title mentions a cooking keyword or a query term.

        Only query words longer than three characters are considered. A
        missing or ``None`` title is treated as empty, so it never matches.

        Args:
            images: Image dicts as produced by the search strategies.
            query: The original search text.

        Returns:
            The relevant images, in their original order.
        """
        cooking_keywords = [
            'food', 'cooking', 'recipe', 'dish', 'meal', 'ingredient', 'kitchen',
            'chef', 'bake', 'cook', 'preparation', 'cuisine', 'delicious', 'tasty'
        ]
        # Loop-invariant: compute the significant query words once.
        query_terms = [word for word in query.lower().split() if len(word) > 3]

        relevant_images = []
        for image in images:
            title = (image.get('title') or '').lower()

            # Check if title contains cooking keywords or query terms
            is_relevant = (
                any(keyword in title for keyword in cooking_keywords) or
                any(term in title for term in query_terms)
            )

            if is_relevant:
                relevant_images.append(image)

        return relevant_images
|
search/engines/video.py
CHANGED
|
@@ -50,13 +50,6 @@ class VideoSearchEngine:
|
|
| 50 |
'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
|
| 51 |
'base_url': 'https://www.youtube.com'
|
| 52 |
},
|
| 53 |
-
{
|
| 54 |
-
'name': 'vinmec_videos',
|
| 55 |
-
'search_url': 'https://www.vinmec.com/vi/tim-kiem',
|
| 56 |
-
'params': {'q': ''},
|
| 57 |
-
'selectors': ['a[href*="/video/"]', 'a[href*="/suc-khoe/"]'],
|
| 58 |
-
'base_url': 'https://www.vinmec.com'
|
| 59 |
-
}
|
| 60 |
],
|
| 61 |
'zh': [
|
| 62 |
{
|
|
@@ -87,8 +80,8 @@ class VideoSearchEngine:
|
|
| 87 |
q = re.sub(r"\s+", " ", q)
|
| 88 |
return q.strip()
|
| 89 |
|
| 90 |
-
def
|
| 91 |
-
"""Check if video is
|
| 92 |
url = result.get('url', '')
|
| 93 |
title = result.get('title', '')
|
| 94 |
|
|
@@ -96,25 +89,23 @@ class VideoSearchEngine:
|
|
| 96 |
if 'results?search_query=' in url:
|
| 97 |
return False
|
| 98 |
|
| 99 |
-
# Skip non-YouTube URLs that aren't
|
| 100 |
-
if 'youtube.com' not in url and not any(
|
| 101 |
return False
|
| 102 |
|
| 103 |
-
# Check if title contains
|
| 104 |
title_lower = title.lower()
|
| 105 |
query_lower = query.lower()
|
| 106 |
|
| 107 |
-
|
| 108 |
-
'
|
| 109 |
-
'symptoms', 'therapy', 'medicine', 'clinical', 'patient',
|
| 110 |
-
'disease', 'condition', 'healthcare', 'physician'
|
| 111 |
]
|
| 112 |
|
| 113 |
-
# Must contain
|
| 114 |
-
|
| 115 |
has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
|
| 116 |
|
| 117 |
-
return
|
| 118 |
|
| 119 |
def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
|
| 120 |
"""Search platform with retry logic and better error handling"""
|
|
@@ -130,9 +121,9 @@ class VideoSearchEngine:
|
|
| 130 |
return []
|
| 131 |
|
| 132 |
def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
|
| 133 |
-
"""Search for
|
| 134 |
query = self._normalize_query(query)
|
| 135 |
-
logger.info(f"Searching for
|
| 136 |
|
| 137 |
results = []
|
| 138 |
seen_urls = set() # Track URLs to avoid duplicates
|
|
@@ -152,7 +143,7 @@ class VideoSearchEngine:
|
|
| 152 |
logger.warning(f"No results from {platform['name']}")
|
| 153 |
continue
|
| 154 |
|
| 155 |
-
# Filter out duplicates and non-
|
| 156 |
for result in platform_results:
|
| 157 |
url = result.get('url', '')
|
| 158 |
video_id = self._extract_video_id(url)
|
|
@@ -161,8 +152,8 @@ class VideoSearchEngine:
|
|
| 161 |
if url in seen_urls or (video_id and video_id in seen_video_ids):
|
| 162 |
continue
|
| 163 |
|
| 164 |
-
# Check if it's a valid
|
| 165 |
-
if self.
|
| 166 |
seen_urls.add(url)
|
| 167 |
if video_id:
|
| 168 |
seen_video_ids.add(video_id)
|
|
@@ -192,7 +183,7 @@ class VideoSearchEngine:
|
|
| 192 |
|
| 193 |
if (url not in seen_urls and
|
| 194 |
video_id not in seen_video_ids and
|
| 195 |
-
self.
|
| 196 |
seen_urls.add(url)
|
| 197 |
if video_id:
|
| 198 |
seen_video_ids.add(video_id)
|
|
@@ -373,8 +364,8 @@ class VideoSearchEngine:
|
|
| 373 |
fallback_videos = {
|
| 374 |
'en': [
|
| 375 |
{
|
| 376 |
-
'url': 'https://www.youtube.com/results?search_query=
|
| 377 |
-
'title': f'
|
| 378 |
'platform': 'youtube_fallback',
|
| 379 |
'type': 'video',
|
| 380 |
'source': 'youtube'
|
|
|
|
| 50 |
'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
|
| 51 |
'base_url': 'https://www.youtube.com'
|
| 52 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
],
|
| 54 |
'zh': [
|
| 55 |
{
|
|
|
|
| 80 |
q = re.sub(r"\s+", " ", q)
|
| 81 |
return q.strip()
|
| 82 |
|
| 83 |
+
def _is_valid_cooking_video(self, result: Dict, query: str) -> bool:
|
| 84 |
+
"""Check if video is cooking-relevant and has valid URL"""
|
| 85 |
url = result.get('url', '')
|
| 86 |
title = result.get('title', '')
|
| 87 |
|
|
|
|
| 89 |
if 'results?search_query=' in url:
|
| 90 |
return False
|
| 91 |
|
| 92 |
+
# Skip non-YouTube URLs that aren't cooking platforms
|
| 93 |
+
if 'youtube.com' not in url and not any(cook in url for cook in ['allrecipes.com', 'foodnetwork.com', 'epicurious.com', 'seriouseats.com']):
|
| 94 |
return False
|
| 95 |
|
| 96 |
+
# Check if title contains cooking keywords or query terms
|
| 97 |
title_lower = title.lower()
|
| 98 |
query_lower = query.lower()
|
| 99 |
|
| 100 |
+
cooking_keywords = [
|
| 101 |
+
'recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner'
|
|
|
|
|
|
|
| 102 |
]
|
| 103 |
|
| 104 |
+
# Must contain cooking keywords or query terms
|
| 105 |
+
has_cooking = any(keyword in title_lower for keyword in cooking_keywords)
|
| 106 |
has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
|
| 107 |
|
| 108 |
+
return has_cooking or has_query
|
| 109 |
|
| 110 |
def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
|
| 111 |
"""Search platform with retry logic and better error handling"""
|
|
|
|
| 121 |
return []
|
| 122 |
|
| 123 |
def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
|
| 124 |
+
"""Search for cooking videos across platforms with deduplication and cooking filtering"""
|
| 125 |
query = self._normalize_query(query)
|
| 126 |
+
logger.info(f"Searching for cooking videos: {query} (language: {language})")
|
| 127 |
|
| 128 |
results = []
|
| 129 |
seen_urls = set() # Track URLs to avoid duplicates
|
|
|
|
| 143 |
logger.warning(f"No results from {platform['name']}")
|
| 144 |
continue
|
| 145 |
|
| 146 |
+
# Filter out duplicates and non-cooking content
|
| 147 |
for result in platform_results:
|
| 148 |
url = result.get('url', '')
|
| 149 |
video_id = self._extract_video_id(url)
|
|
|
|
| 152 |
if url in seen_urls or (video_id and video_id in seen_video_ids):
|
| 153 |
continue
|
| 154 |
|
| 155 |
+
# Check if it's a valid cooking video (less strict for more results)
|
| 156 |
+
if self._is_valid_cooking_video(result, query):
|
| 157 |
seen_urls.add(url)
|
| 158 |
if video_id:
|
| 159 |
seen_video_ids.add(video_id)
|
|
|
|
| 183 |
|
| 184 |
if (url not in seen_urls and
|
| 185 |
video_id not in seen_video_ids and
|
| 186 |
+
self._is_valid_cooking_video(result, query)):
|
| 187 |
seen_urls.add(url)
|
| 188 |
if video_id:
|
| 189 |
seen_video_ids.add(video_id)
|
|
|
|
| 364 |
fallback_videos = {
|
| 365 |
'en': [
|
| 366 |
{
|
| 367 |
+
'url': 'https://www.youtube.com/results?search_query=cooking+' + quote(query),
|
| 368 |
+
'title': f'Cooking Videos: {query}',
|
| 369 |
'platform': 'youtube_fallback',
|
| 370 |
'type': 'video',
|
| 371 |
'source': 'youtube'
|
search/processors/cooking.py
CHANGED
|
@@ -30,7 +30,7 @@ class CookingSearchProcessor:
|
|
| 30 |
],
|
| 31 |
'dietary': [
|
| 32 |
'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
|
| 33 |
-
'healthy', 'low-carb', 'low-fat', 'protein', 'fiber'
|
| 34 |
],
|
| 35 |
'meal_types': [
|
| 36 |
'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
|
|
@@ -272,12 +272,12 @@ class CookingSearchProcessor:
|
|
| 272 |
|
| 273 |
# Add topic header
|
| 274 |
topic_headers = {
|
| 275 |
-
'recipes': "
|
| 276 |
-
'techniques': "
|
| 277 |
-
'ingredients': "
|
| 278 |
-
'equipment': "
|
| 279 |
-
'tips_tricks': "
|
| 280 |
-
'general': "
|
| 281 |
}
|
| 282 |
|
| 283 |
header = topic_headers.get(topic, "**Information:**")
|
|
|
|
| 30 |
],
|
| 31 |
'dietary': [
|
| 32 |
'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
|
| 33 |
+
'healthy', 'low-carb', 'low-fat', 'protein', 'fiber', 'nutritious', 'balanced'
|
| 34 |
],
|
| 35 |
'meal_types': [
|
| 36 |
'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
|
|
|
|
| 272 |
|
| 273 |
# Add topic header
|
| 274 |
topic_headers = {
|
| 275 |
+
'recipes': "**Recipes and Instructions:**",
|
| 276 |
+
'techniques': "**Cooking Techniques:**",
|
| 277 |
+
'ingredients': "**Ingredients and Substitutions:**",
|
| 278 |
+
'equipment': "**Equipment and Tools:**",
|
| 279 |
+
'tips_tricks': "**Tips and Tricks:**",
|
| 280 |
+
'general': "**General Information:**"
|
| 281 |
}
|
| 282 |
|
| 283 |
header = topic_headers.get(topic, "**Information:**")
|
search/processors/enhanced.py
CHANGED
|
@@ -92,7 +92,7 @@ class EnhancedContentProcessor:
|
|
| 92 |
return comprehensive_summary, reference_mapping
|
| 93 |
|
| 94 |
def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
|
| 95 |
-
"""Extract structured information by
|
| 96 |
structured_info = defaultdict(list)
|
| 97 |
|
| 98 |
for source in sources:
|
|
@@ -100,8 +100,8 @@ class EnhancedContentProcessor:
|
|
| 100 |
if not content:
|
| 101 |
continue
|
| 102 |
|
| 103 |
-
# Extract information by
|
| 104 |
-
for category, patterns in self.
|
| 105 |
extracted_info = self._extract_category_info(content, patterns, category, user_query)
|
| 106 |
if extracted_info:
|
| 107 |
structured_info[category].append({
|
|
|
|
| 92 |
return comprehensive_summary, reference_mapping
|
| 93 |
|
| 94 |
def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
|
| 95 |
+
"""Extract structured information by cooking categories"""
|
| 96 |
structured_info = defaultdict(list)
|
| 97 |
|
| 98 |
for source in sources:
|
|
|
|
| 100 |
if not content:
|
| 101 |
continue
|
| 102 |
|
| 103 |
+
# Extract information by cooking categories
|
| 104 |
+
for category, patterns in self.cooking_patterns.items():
|
| 105 |
extracted_info = self._extract_category_info(content, patterns, category, user_query)
|
| 106 |
if extracted_info:
|
| 107 |
structured_info[category].append({
|
search/processors/language.py
CHANGED
|
@@ -10,57 +10,59 @@ logger = logging.getLogger(__name__)
|
|
| 10 |
DetectorFactory.seed = 0
|
| 11 |
|
| 12 |
class LanguageProcessor:
|
| 13 |
-
"""Process and enhance queries for multilingual
|
| 14 |
|
| 15 |
def __init__(self):
|
| 16 |
-
#
|
| 17 |
-
self.
|
| 18 |
'en': [
|
| 19 |
-
'
|
| 20 |
-
'
|
| 21 |
-
'
|
| 22 |
-
'
|
| 23 |
-
'
|
| 24 |
-
'
|
| 25 |
-
'
|
| 26 |
-
'
|
| 27 |
-
'
|
| 28 |
],
|
| 29 |
'vi': [
|
| 30 |
-
'
|
| 31 |
-
'
|
| 32 |
-
'
|
| 33 |
-
'
|
| 34 |
-
'
|
| 35 |
-
'
|
| 36 |
-
'
|
| 37 |
-
'
|
| 38 |
-
'
|
| 39 |
],
|
| 40 |
'zh': [
|
| 41 |
-
'
|
| 42 |
-
'
|
| 43 |
-
'
|
| 44 |
-
'
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
-
'
|
|
|
|
|
|
|
| 48 |
]
|
| 49 |
}
|
| 50 |
|
| 51 |
# Language-specific search enhancements
|
| 52 |
self.language_enhancements = {
|
| 53 |
'vi': {
|
| 54 |
-
'common_terms': ['là gì', '
|
| 55 |
-
'
|
| 56 |
},
|
| 57 |
'zh': {
|
| 58 |
-
'common_terms': ['是什么', '
|
| 59 |
-
'
|
| 60 |
},
|
| 61 |
'en': {
|
| 62 |
-
'common_terms': ['what is', '
|
| 63 |
-
'
|
| 64 |
}
|
| 65 |
}
|
| 66 |
|
|
@@ -136,16 +138,16 @@ class LanguageProcessor:
|
|
| 136 |
"""Enhance query for a specific language"""
|
| 137 |
enhancements = self.language_enhancements.get(language, {})
|
| 138 |
common_terms = enhancements.get('common_terms', [])
|
| 139 |
-
|
| 140 |
|
| 141 |
-
# Check if query already contains
|
| 142 |
query_lower = query.lower()
|
| 143 |
-
|
| 144 |
|
| 145 |
-
# If no
|
| 146 |
-
if not
|
| 147 |
-
# Add the most relevant
|
| 148 |
-
query += f" {
|
| 149 |
|
| 150 |
# Check if query is a question and add relevant terms
|
| 151 |
if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
|
|
@@ -155,62 +157,58 @@ class LanguageProcessor:
|
|
| 155 |
return query.strip()
|
| 156 |
|
| 157 |
def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
|
| 158 |
-
"""Simple keyword-based translation for
|
| 159 |
# This is a basic implementation - in production, you'd use a proper translation service
|
| 160 |
|
| 161 |
-
#
|
| 162 |
translations = {
|
| 163 |
('vi', 'en'): {
|
| 164 |
-
'
|
| 165 |
-
'
|
| 166 |
-
'
|
| 167 |
-
'
|
| 168 |
-
'
|
| 169 |
-
'
|
| 170 |
-
'
|
| 171 |
-
'
|
| 172 |
-
'
|
| 173 |
-
'
|
| 174 |
-
'bệnh viện': 'hospital'
|
| 175 |
},
|
| 176 |
('zh', 'en'): {
|
| 177 |
-
'
|
| 178 |
-
'
|
| 179 |
-
'
|
| 180 |
-
'
|
| 181 |
-
'
|
| 182 |
-
'
|
| 183 |
-
'
|
| 184 |
-
'
|
| 185 |
-
'
|
| 186 |
-
'
|
| 187 |
-
'医院': 'hospital'
|
| 188 |
},
|
| 189 |
('en', 'vi'): {
|
| 190 |
-
'
|
| 191 |
-
'
|
| 192 |
-
'
|
| 193 |
-
'
|
| 194 |
-
'
|
| 195 |
-
'
|
| 196 |
-
'
|
| 197 |
-
'
|
| 198 |
-
'
|
| 199 |
-
'
|
| 200 |
-
'hospital': 'bệnh viện'
|
| 201 |
},
|
| 202 |
('en', 'zh'): {
|
| 203 |
-
'
|
| 204 |
-
'
|
| 205 |
-
'
|
| 206 |
-
'
|
| 207 |
-
'
|
| 208 |
-
'
|
| 209 |
-
'
|
| 210 |
-
'
|
| 211 |
-
'
|
| 212 |
-
'
|
| 213 |
-
'hospital': '医院'
|
| 214 |
}
|
| 215 |
}
|
| 216 |
|
|
@@ -223,12 +221,12 @@ class LanguageProcessor:
|
|
| 223 |
|
| 224 |
return translated_query
|
| 225 |
|
| 226 |
-
def
|
| 227 |
-
"""Calculate
|
| 228 |
if not text:
|
| 229 |
return 0.0
|
| 230 |
|
| 231 |
-
keywords = self.
|
| 232 |
if not keywords:
|
| 233 |
return 0.0
|
| 234 |
|
|
|
|
| 10 |
DetectorFactory.seed = 0
|
| 11 |
|
| 12 |
class LanguageProcessor:
|
| 13 |
+
"""Process and enhance queries for multilingual cooking search"""
|
| 14 |
|
| 15 |
def __init__(self):
|
| 16 |
+
# Cooking keywords in different languages
|
| 17 |
+
self.cooking_keywords = {
|
| 18 |
'en': [
|
| 19 |
+
'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
|
| 20 |
+
'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
|
| 21 |
+
'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
|
| 22 |
+
'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
|
| 23 |
+
'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
|
| 24 |
+
'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
|
| 25 |
+
'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai',
|
| 26 |
+
'substitution', 'alternative', 'variation', 'modification', 'adaptation',
|
| 27 |
+
'troubleshooting', 'tips', 'tricks', 'hacks', 'mistakes', 'common errors'
|
| 28 |
],
|
| 29 |
'vi': [
|
| 30 |
+
'công thức', 'nấu ăn', 'nướng', 'rang', 'nướng vỉ', 'chiên', 'luộc', 'hấp',
|
| 31 |
+
'nguyên liệu', 'gia vị', 'thảo mộc', 'nước sốt', 'tẩm ướp', 'dressing',
|
| 32 |
+
'kỹ thuật', 'phương pháp', 'nhiệt độ', 'thời gian', 'chuẩn bị', 'thời gian nấu',
|
| 33 |
+
'lò nướng', 'bếp', 'vỉ nướng', 'chảo', 'nồi', 'dao', 'cắt',
|
| 34 |
+
'chay', 'thuần chay', 'không gluten', 'không sữa', 'keto', 'paleo',
|
| 35 |
+
'khai vị', 'món chính', 'tráng miệng', 'sáng', 'trưa', 'tối',
|
| 36 |
+
'ẩm thực', 'ý', 'trung', 'mexico', 'pháp', 'ấn', 'thái',
|
| 37 |
+
'thay thế', 'biến tấu', 'sửa đổi', 'thích ứng',
|
| 38 |
+
'khắc phục', 'mẹo', 'thủ thuật', 'lỗi thường gặp'
|
| 39 |
],
|
| 40 |
'zh': [
|
| 41 |
+
'食谱', '烹饪', '烘焙', '烤', '烧烤', '炸', '煮', '蒸',
|
| 42 |
+
'食材', '调料', '香料', '香草', '酱汁', '腌料', '调料',
|
| 43 |
+
'技巧', '方法', '温度', '时间', '准备', '烹饪时间',
|
| 44 |
+
'烤箱', '炉灶', '烤架', '平底锅', '锅', '刀', '切',
|
| 45 |
+
'素食', '纯素', '无麸质', '无乳制品', '生酮', '古法',
|
| 46 |
+
'开胃菜', '主菜', '甜点', '早餐', '午餐', '晚餐',
|
| 47 |
+
'菜系', '意大利', '中国', '墨西哥', '法国', '印度', '泰国',
|
| 48 |
+
'替代', '变化', '修改', '适应',
|
| 49 |
+
'故障排除', '技巧', '窍门', '常见错误'
|
| 50 |
]
|
| 51 |
}
|
| 52 |
|
| 53 |
# Language-specific search enhancements
|
| 54 |
self.language_enhancements = {
|
| 55 |
'vi': {
|
| 56 |
+
'common_terms': ['là gì', 'cách nấu', 'công thức', 'nguyên liệu'],
|
| 57 |
+
'cooking_context': ['nấu ăn', 'ẩm thực', 'bếp', 'đầu bếp']
|
| 58 |
},
|
| 59 |
'zh': {
|
| 60 |
+
'common_terms': ['是什么', '怎么做', '食谱', '食材'],
|
| 61 |
+
'cooking_context': ['烹饪', '美食', '厨房', '厨师']
|
| 62 |
},
|
| 63 |
'en': {
|
| 64 |
+
'common_terms': ['what is', 'how to cook', 'recipe', 'ingredients'],
|
| 65 |
+
'cooking_context': ['cooking', 'culinary', 'kitchen', 'chef']
|
| 66 |
}
|
| 67 |
}
|
| 68 |
|
|
|
|
| 138 |
"""Enhance query for a specific language"""
|
| 139 |
enhancements = self.language_enhancements.get(language, {})
|
| 140 |
common_terms = enhancements.get('common_terms', [])
|
| 141 |
+
cooking_context = enhancements.get('cooking_context', [])
|
| 142 |
|
| 143 |
+
# Check if query already contains cooking context
|
| 144 |
query_lower = query.lower()
|
| 145 |
+
has_cooking_context = any(term in query_lower for term in cooking_context)
|
| 146 |
|
| 147 |
+
# If no cooking context, add it
|
| 148 |
+
if not has_cooking_context and cooking_context:
|
| 149 |
+
# Add the most relevant cooking context term
|
| 150 |
+
query += f" {cooking_context[0]}"
|
| 151 |
|
| 152 |
# Check if query is a question and add relevant terms
|
| 153 |
if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
|
|
|
|
| 157 |
return query.strip()
|
| 158 |
|
| 159 |
def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
|
| 160 |
+
"""Simple keyword-based translation for cooking terms"""
|
| 161 |
# This is a basic implementation - in production, you'd use a proper translation service
|
| 162 |
|
| 163 |
+
# Cooking term translations
|
| 164 |
translations = {
|
| 165 |
('vi', 'en'): {
|
| 166 |
+
'công thức': 'recipe',
|
| 167 |
+
'nấu ăn': 'cooking',
|
| 168 |
+
'nguyên liệu': 'ingredients',
|
| 169 |
+
'gia vị': 'seasoning',
|
| 170 |
+
'kỹ thuật': 'technique',
|
| 171 |
+
'nướng': 'baking',
|
| 172 |
+
'chiên': 'frying',
|
| 173 |
+
'luộc': 'boiling',
|
| 174 |
+
'hấp': 'steaming',
|
| 175 |
+
'nước sốt': 'sauce'
|
|
|
|
| 176 |
},
|
| 177 |
('zh', 'en'): {
|
| 178 |
+
'食谱': 'recipe',
|
| 179 |
+
'烹饪': 'cooking',
|
| 180 |
+
'食材': 'ingredients',
|
| 181 |
+
'调料': 'seasoning',
|
| 182 |
+
'技巧': 'technique',
|
| 183 |
+
'烘焙': 'baking',
|
| 184 |
+
'炸': 'frying',
|
| 185 |
+
'煮': 'boiling',
|
| 186 |
+
'蒸': 'steaming',
|
| 187 |
+
'酱汁': 'sauce'
|
|
|
|
| 188 |
},
|
| 189 |
('en', 'vi'): {
|
| 190 |
+
'recipe': 'công thức',
|
| 191 |
+
'cooking': 'nấu ăn',
|
| 192 |
+
'ingredients': 'nguyên liệu',
|
| 193 |
+
'seasoning': 'gia vị',
|
| 194 |
+
'technique': 'kỹ thuật',
|
| 195 |
+
'baking': 'nướng',
|
| 196 |
+
'frying': 'chiên',
|
| 197 |
+
'boiling': 'luộc',
|
| 198 |
+
'steaming': 'hấp',
|
| 199 |
+
'sauce': 'nước sốt'
|
|
|
|
| 200 |
},
|
| 201 |
('en', 'zh'): {
|
| 202 |
+
'recipe': '食谱',
|
| 203 |
+
'cooking': '烹饪',
|
| 204 |
+
'ingredients': '食材',
|
| 205 |
+
'seasoning': '调料',
|
| 206 |
+
'technique': '技巧',
|
| 207 |
+
'baking': '烘焙',
|
| 208 |
+
'frying': '炸',
|
| 209 |
+
'boiling': '煮',
|
| 210 |
+
'steaming': '蒸',
|
| 211 |
+
'sauce': '酱汁'
|
|
|
|
| 212 |
}
|
| 213 |
}
|
| 214 |
|
|
|
|
| 221 |
|
| 222 |
return translated_query
|
| 223 |
|
| 224 |
+
def get_cooking_relevance_score(self, text: str, language: str) -> float:
|
| 225 |
+
"""Calculate cooking relevance score for text in a specific language"""
|
| 226 |
if not text:
|
| 227 |
return 0.0
|
| 228 |
|
| 229 |
+
keywords = self.cooking_keywords.get(language, [])
|
| 230 |
if not keywords:
|
| 231 |
return 0.0
|
| 232 |
|
search/processors/sources.py
CHANGED
|
@@ -13,9 +13,9 @@ class SourceAggregator:
|
|
| 13 |
# (Removed credibility scoring; keep placeholder map for future use)
|
| 14 |
self.source_credibility = {
|
| 15 |
# English sources
|
| 16 |
-
'
|
| 17 |
-
'
|
| 18 |
-
'
|
| 19 |
'medlineplus.gov': 0.95,
|
| 20 |
'nih.gov': 0.98,
|
| 21 |
'cdc.gov': 0.98,
|
|
@@ -29,7 +29,7 @@ class SourceAggregator:
|
|
| 29 |
'hellobacsi.com': 0.85,
|
| 30 |
'alobacsi.com': 0.82,
|
| 31 |
'vinmec.com': 0.88,
|
| 32 |
-
'
|
| 33 |
'medlatec.vn': 0.83,
|
| 34 |
'suckhoedoisong.vn': 0.90,
|
| 35 |
'viendinhduong.vn': 0.87,
|
|
@@ -40,7 +40,7 @@ class SourceAggregator:
|
|
| 40 |
'chunyuyisheng.com': 0.84,
|
| 41 |
'xywy.com': 0.82,
|
| 42 |
'jiankang.com': 0.80,
|
| 43 |
-
'
|
| 44 |
|
| 45 |
# Video platforms
|
| 46 |
'youtube.com': 0.70,
|
|
@@ -50,8 +50,8 @@ class SourceAggregator:
|
|
| 50 |
# Source type classification
|
| 51 |
self.source_types = {
|
| 52 |
'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
|
| 53 |
-
'
|
| 54 |
-
'commercial': ['
|
| 55 |
'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
|
| 56 |
'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
|
| 57 |
'video': ['youtube.com', 'medscape.com']
|
|
@@ -325,7 +325,7 @@ class SourceAggregator:
|
|
| 325 |
# Create type indicator
|
| 326 |
type_icons = {
|
| 327 |
'academic': '🎓',
|
| 328 |
-
'
|
| 329 |
'government': '🏛️',
|
| 330 |
'commercial': '💼',
|
| 331 |
'professional': '👨⚕️',
|
|
|
|
| 13 |
# (Removed credibility scoring; keep placeholder map for future use)
|
| 14 |
self.source_credibility = {
|
| 15 |
# English sources
|
| 16 |
+
'allrecipes.com': 0.95,
|
| 17 |
+
'foodnetwork.com': 0.90,
|
| 18 |
+
'epicurious.com': 0.88,
|
| 19 |
'medlineplus.gov': 0.95,
|
| 20 |
'nih.gov': 0.98,
|
| 21 |
'cdc.gov': 0.98,
|
|
|
|
| 29 |
'hellobacsi.com': 0.85,
|
| 30 |
'alobacsi.com': 0.82,
|
| 31 |
'vinmec.com': 0.88,
|
| 32 |
+
'monngonviet.com': 0.85,
|
| 33 |
'medlatec.vn': 0.83,
|
| 34 |
'suckhoedoisong.vn': 0.90,
|
| 35 |
'viendinhduong.vn': 0.87,
|
|
|
|
| 40 |
'chunyuyisheng.com': 0.84,
|
| 41 |
'xywy.com': 0.82,
|
| 42 |
'jiankang.com': 0.80,
|
| 43 |
+
'xiachufang.com': 0.85,
|
| 44 |
|
| 45 |
# Video platforms
|
| 46 |
'youtube.com': 0.70,
|
|
|
|
| 50 |
# Source type classification
|
| 51 |
self.source_types = {
|
| 52 |
'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
|
| 53 |
+
'cooking_sites': ['allrecipes.com', 'foodnetwork.com', 'epicurious.com'],
|
| 54 |
+
'commercial': ['seriouseats.com', 'bonappetit.com', 'tasteofhome.com'],
|
| 55 |
'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
|
| 56 |
'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
|
| 57 |
'video': ['youtube.com', 'medscape.com']
|
|
|
|
| 325 |
# Create type indicator
|
| 326 |
type_icons = {
|
| 327 |
'academic': '🎓',
|
| 328 |
+
'cooking_sites': '🍳',
|
| 329 |
'government': '🏛️',
|
| 330 |
'commercial': '💼',
|
| 331 |
'professional': '👨⚕️',
|
search/search.py
CHANGED
|
@@ -4,6 +4,7 @@ import time
|
|
| 4 |
import hashlib
|
| 5 |
from .engines.duckduckgo import DuckDuckGoEngine
|
| 6 |
from .engines.video import VideoSearchEngine
|
|
|
|
| 7 |
from .coordinator import SearchCoordinator
|
| 8 |
# Reranker removed - using simple relevance scoring for cooking content
|
| 9 |
from models import summarizer
|
|
@@ -13,6 +14,7 @@ logger = logging.getLogger(__name__)
|
|
| 13 |
# Global instances
|
| 14 |
_duckduckgo_engine = None
|
| 15 |
_video_engine = None
|
|
|
|
| 16 |
_reranker = None
|
| 17 |
_search_coordinator = None
|
| 18 |
|
|
@@ -34,6 +36,13 @@ def get_video_engine() -> VideoSearchEngine:
|
|
| 34 |
_video_engine = VideoSearchEngine()
|
| 35 |
return _video_engine
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
def get_reranker():
|
| 38 |
"""Simple cooking relevance scorer - no complex reranking needed"""
|
| 39 |
return None
|
|
@@ -237,8 +246,20 @@ def search_videos(query: str, num_results: int = 2, target_language: str = None)
|
|
| 237 |
logger.error(f"Video search failed: {e}")
|
| 238 |
return []
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
# Comprehensive search function with maximum information extraction
|
| 241 |
-
def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
|
| 242 |
"""Comprehensive search with maximum information extraction and detailed references"""
|
| 243 |
logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
|
| 244 |
|
|
@@ -299,8 +320,20 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
|
|
| 299 |
except Exception as e:
|
| 300 |
logger.warning(f"Video search failed: {e}")
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
# Combine all results
|
| 303 |
-
all_results = text_results + video_results
|
| 304 |
|
| 305 |
# Simple cooking relevance filtering
|
| 306 |
if all_results:
|
|
@@ -351,7 +384,10 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
|
|
| 351 |
'total_sources': len(all_results),
|
| 352 |
'text_sources': len(text_results),
|
| 353 |
'video_sources': len(video_results),
|
| 354 |
-
'
|
|
|
|
|
|
|
|
|
|
| 355 |
}
|
| 356 |
|
| 357 |
logger.info(f"Comprehensive search completed: {len(all_results)} total sources")
|
|
|
|
| 4 |
import hashlib
|
| 5 |
from .engines.duckduckgo import DuckDuckGoEngine
|
| 6 |
from .engines.video import VideoSearchEngine
|
| 7 |
+
from .engines.image import ImageSearchEngine
|
| 8 |
from .coordinator import SearchCoordinator
|
| 9 |
# Reranker removed - using simple relevance scoring for cooking content
|
| 10 |
from models import summarizer
|
|
|
|
| 14 |
# Global instances
|
| 15 |
_duckduckgo_engine = None
|
| 16 |
_video_engine = None
|
| 17 |
+
_image_engine = None
|
| 18 |
_reranker = None
|
| 19 |
_search_coordinator = None
|
| 20 |
|
|
|
|
| 36 |
_video_engine = VideoSearchEngine()
|
| 37 |
return _video_engine
|
| 38 |
|
| 39 |
+
def get_image_engine() -> ImageSearchEngine:
|
| 40 |
+
"""Get or create the global image engine instance"""
|
| 41 |
+
global _image_engine
|
| 42 |
+
if _image_engine is None:
|
| 43 |
+
_image_engine = ImageSearchEngine()
|
| 44 |
+
return _image_engine
|
| 45 |
+
|
| 46 |
def get_reranker():
|
| 47 |
"""Simple cooking relevance scorer - no complex reranking needed"""
|
| 48 |
return None
|
|
|
|
| 246 |
logger.error(f"Video search failed: {e}")
|
| 247 |
return []
|
| 248 |
|
| 249 |
+
# Image search function
|
| 250 |
+
def search_images(query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
|
| 251 |
+
"""Search for cooking-related images"""
|
| 252 |
+
try:
|
| 253 |
+
# Clean the query first
|
| 254 |
+
cleaned_query = _clean_search_query(query)
|
| 255 |
+
coordinator = get_search_coordinator()
|
| 256 |
+
return coordinator.image_search(cleaned_query, num_results, target_language)
|
| 257 |
+
except Exception as e:
|
| 258 |
+
logger.error(f"Image search failed: {e}")
|
| 259 |
+
return []
|
| 260 |
+
|
| 261 |
# Comprehensive search function with maximum information extraction
|
| 262 |
+
def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True, include_images: bool = True) -> Tuple[str, Dict[int, str], Dict]:
|
| 263 |
"""Comprehensive search with maximum information extraction and detailed references"""
|
| 264 |
logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
|
| 265 |
|
|
|
|
| 320 |
except Exception as e:
|
| 321 |
logger.warning(f"Video search failed: {e}")
|
| 322 |
|
| 323 |
+
# Search for images if requested
|
| 324 |
+
image_results = []
|
| 325 |
+
if include_images:
|
| 326 |
+
try:
|
| 327 |
+
image_engine = get_image_engine()
|
| 328 |
+
# Limit image results to avoid over-fetching
|
| 329 |
+
max_image_results = min(3, num_results // 5) # Max 3 or 1/5 of total
|
| 330 |
+
image_results = image_engine.search_cooking_images(boosted_query, max_image_results, search_language)
|
| 331 |
+
logger.info(f"Found {len(image_results)} image results")
|
| 332 |
+
except Exception as e:
|
| 333 |
+
logger.warning(f"Image search failed: {e}")
|
| 334 |
+
|
| 335 |
# Combine all results
|
| 336 |
+
all_results = text_results + video_results + image_results
|
| 337 |
|
| 338 |
# Simple cooking relevance filtering
|
| 339 |
if all_results:
|
|
|
|
| 384 |
'total_sources': len(all_results),
|
| 385 |
'text_sources': len(text_results),
|
| 386 |
'video_sources': len(video_results),
|
| 387 |
+
'image_sources': len(image_results),
|
| 388 |
+
'sources': all_results,
|
| 389 |
+
'videos': video_results,
|
| 390 |
+
'images': image_results
|
| 391 |
}
|
| 392 |
|
| 393 |
logger.info(f"Comprehensive search completed: {len(all_results)} total sources")
|