Spaces:

BinKhoaLe1812
/

Cooking_Tutor

Sleeping

App Files Community

LiamKhoaLe committed on Oct 18

Commit

3685b45

1 Parent(s): 4bc06b1

Upd cooking-specs services. Upd multilinguability processors

Browse files

Files changed (11) hide show

api/chatbot.py +23 -10
api/routes.py +7 -1
search/coordinator.py +28 -1
search/engines/duckduckgo.py +3 -3
search/engines/image.py +231 -0
search/engines/video.py +18 -27
search/processors/cooking.py +7 -7
search/processors/enhanced.py +3 -3
search/processors/language.py +88 -90
search/processors/sources.py +8 -8
search/search.py +39 -3

api/chatbot.py CHANGED Viewed

@@ -7,7 +7,6 @@ from .config import gemini_flash_api_key
 from memory import MemoryManager
 from utils import translate_query
 from search import search_comprehensive
-# Safety guard removed - cooking tutor doesn't need medical safety checks
 logger = logging.getLogger("cooking-tutor")
@@ -66,9 +65,8 @@ class CookingTutorChatbot:
         cuisine: str = None,
         structured: bool = False,
     ) -> str:
-        # Translate to English-centric search if needed
-        if lang.upper() in {"VI", "ZH"}:
-            user_query = translate_query(user_query, lang.lower())
         # Basic cooking relevance check
         cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
@@ -88,11 +86,13 @@ class CookingTutorChatbot:
         if search_mode:
             try:
                 search_context, url_mapping, source_aggregation = search_comprehensive(
-                    f"cooking technique tutorial: {user_query}",
                     num_results=12,
                     target_language=lang,
-                    include_videos=bool(video_mode)
                 )
                 if video_mode and source_aggregation:
                     video_results = source_aggregation.get('sources', []) or []
@@ -170,11 +170,24 @@ class CookingTutorChatbot:
         if user_id:
             self.memory.add_exchange(user_id, user_query, response, lang=lang)
         if video_mode and video_results:
-            return {
-                'text': response.strip(),
-                'videos': video_results
-            }
         return response.strip()
     def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:

 from memory import MemoryManager
 from utils import translate_query
 from search import search_comprehensive
 logger = logging.getLogger("cooking-tutor")
         cuisine: str = None,
         structured: bool = False,
     ) -> str:
+        # Keep original language for native search - no translation needed
+        # The search engines now support native language sources
         # Basic cooking relevance check
         cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
         if search_mode:
             try:
+                # Use native language search for better results
                 search_context, url_mapping, source_aggregation = search_comprehensive(
+                    user_query,  # Use original query without English prefix
                     num_results=12,
                     target_language=lang,
+                    include_videos=bool(video_mode),
+                    include_images=True  # Always include images for visual appeal
                 )
                 if video_mode and source_aggregation:
                     video_results = source_aggregation.get('sources', []) or []
         if user_id:
             self.memory.add_exchange(user_id, user_query, response, lang=lang)
+        # Prepare response with media
+        response_data = {
+            'text': response.strip()
+        }
+        # Add videos if available
         if video_mode and video_results:
+            response_data['videos'] = video_results
+        # Add images if available
+        if source_aggregation and 'images' in source_aggregation:
+            images = source_aggregation['images']
+            if images:
+                response_data['images'] = images[:3]  # Limit to 3 images
+        # Return structured response if we have media, otherwise just text
+        if len(response_data) > 1:
+            return response_data
         return response.strip()
     def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:

api/routes.py CHANGED Viewed

@@ -59,13 +59,15 @@ async def chat_endpoint(req: Request):
         )
         elapsed = time.time() - start
-        # Handle response format (might be string or dict with videos)
         if isinstance(answer, dict):
             response_text = answer.get('text', '')
             video_data = answer.get('videos', [])
         else:
             response_text = answer
             video_data = []
         # Final response
         response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
@@ -74,6 +76,10 @@ async def chat_endpoint(req: Request):
         if video_data:
             response_data["videos"] = video_data
         return JSONResponse(response_data)
     except Exception as e:

         )
         elapsed = time.time() - start
+        # Handle response format (might be string or dict with videos/images)
         if isinstance(answer, dict):
             response_text = answer.get('text', '')
             video_data = answer.get('videos', [])
+            image_data = answer.get('images', [])
         else:
             response_text = answer
             video_data = []
+            image_data = []
         # Final response
         response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}
         if video_data:
             response_data["videos"] = video_data
+        # Include image data if available
+        if image_data:
+            response_data["images"] = image_data
         return JSONResponse(response_data)
     except Exception as e:

search/coordinator.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .engines.duckduckgo import DuckDuckGoEngine
 from .engines.cooking import CookingSearchEngine
 from .engines.multilingual import MultilingualCookingEngine
 from .engines.video import VideoSearchEngine
 from .extractors.content import ContentExtractor
 from .processors.cooking import CookingSearchProcessor
 from .processors.language import LanguageProcessor
@@ -27,6 +28,7 @@ class SearchCoordinator:
         self.cooking_engine = CookingSearchEngine()
         self.multilingual_engine = MultilingualCookingEngine()
         self.video_engine = VideoSearchEngine()
         # Initialize processors
         self.content_extractor = ContentExtractor()
@@ -105,7 +107,7 @@ class SearchCoordinator:
         return summary, url_mapping
     def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
-        """Search using multilingual medical engine"""
         try:
             if language:
                 results = self.multilingual_engine.search_by_language(query, language, num_results)
@@ -454,6 +456,31 @@ class SearchCoordinator:
         logger.info(f"Video search completed: {len(video_results)} videos found")
         return video_results
     def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
         """Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.

 from .engines.cooking import CookingSearchEngine
 from .engines.multilingual import MultilingualCookingEngine
 from .engines.video import VideoSearchEngine
+from .engines.image import ImageSearchEngine
 from .extractors.content import ContentExtractor
 from .processors.cooking import CookingSearchProcessor
 from .processors.language import LanguageProcessor
         self.cooking_engine = CookingSearchEngine()
         self.multilingual_engine = MultilingualCookingEngine()
         self.video_engine = VideoSearchEngine()
+        self.image_engine = ImageSearchEngine()
         # Initialize processors
         self.content_extractor = ContentExtractor()
         return summary, url_mapping
     def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
+        """Search using multilingual cooking engine"""
         try:
             if language:
                 results = self.multilingual_engine.search_by_language(query, language, num_results)
         logger.info(f"Video search completed: {len(video_results)} videos found")
         return video_results
+    def image_search(self, query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
+        """Search for cooking-related images"""
+        logger.info(f"Image search for: {query} (target: {target_language})")
+        # Detect language if not provided
+        if not target_language:
+            target_language = self.language_processor.detect_language(query)
+        # Map language codes
+        lang_mapping = {
+            'EN': 'en',
+            'VI': 'vi',
+            'ZH': 'zh',
+            'en': 'en',
+            'vi': 'vi',
+            'zh': 'zh'
+        }
+        search_language = lang_mapping.get(target_language, 'en')
+        # Search for images
+        image_results = self.image_engine.search_cooking_images(query, num_results, search_language)
+        logger.info(f"Image search completed: {len(image_results)} images found")
+        return image_results
     def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
         """Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.

search/engines/duckduckgo.py CHANGED Viewed

@@ -143,11 +143,11 @@ class DuckDuckGoEngine:
             return ' '.join(words[:3])  # Max 3 words
     def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
-        """Filter out irrelevant sources like generic health pages, quizzes, etc."""
         import re
         filtered = []
-        # Only exclude obvious non-medical content
         exclude_patterns = [
             r'/quiz$',  # Quiz pages (end of URL)
             r'/test$',  # Test pages (end of URL)
@@ -325,7 +325,7 @@ class DuckDuckGoEngine:
                 'format': 'json',
                 'no_html': '1',
                 'skip_disambig': '1',
-                't': 'MedicalChatbot'
             }
             response = self.session.get(url, params=params, timeout=self.timeout)

             return ' '.join(words[:3])  # Max 3 words
     def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
+        """Filter out irrelevant sources like generic pages, quizzes, etc."""
         import re
         filtered = []
+        # Only exclude obvious non-cooking content
         exclude_patterns = [
             r'/quiz$',  # Quiz pages (end of URL)
             r'/test$',  # Test pages (end of URL)
                 'format': 'json',
                 'no_html': '1',
                 'skip_disambig': '1',
+                't': 'CookingTutor'
             }
             response = self.session.get(url, params=params, timeout=self.timeout)

search/engines/image.py ADDED Viewed

	@@ -0,0 +1,231 @@

+import requests
+from bs4 import BeautifulSoup
+import logging
+from typing import List, Dict
+import time
+import re
+logger = logging.getLogger(__name__)
+class ImageSearchEngine:
+    """Search engine for cooking-related images"""
+    def __init__(self, timeout: int = 15):
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        })
+        self.timeout = timeout
+    def search_cooking_images(self, query: str, num_results: int = 3, language: str = "en") -> List[Dict]:
+        """Search for cooking-related images"""
+        results = []
+        # Try multiple image search strategies
+        strategies = [
+            self._search_google_images,
+            self._search_bing_images,
+            self._search_unsplash
+        ]
+        for strategy in strategies:
+            try:
+                strategy_results = strategy(query, num_results, language)
+                if strategy_results:
+                    results.extend(strategy_results)
+                    logger.info(f"Image search found {len(strategy_results)} results")
+                    if len(results) >= num_results:
+                        break
+            except Exception as e:
+                logger.warning(f"Image search strategy failed: {e}")
+                continue
+        return results[:num_results]
+    def _search_google_images(self, query: str, num_results: int, language: str) -> List[Dict]:
+        """Search Google Images for cooking content"""
+        try:
+            # Add cooking context to improve relevance
+            cooking_query = f"{query} recipe cooking food dish"
+            url = "https://www.google.com/search"
+            params = {
+                'q': cooking_query,
+                'tbm': 'isch',  # Image search
+                'hl': language,
+                'safe': 'active'
+            }
+            response = self.session.get(url, params=params, timeout=self.timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            results = []
+            # Find image containers
+            image_containers = soup.find_all('div', class_='islrc')
+            for container in image_containers[:num_results]:
+                try:
+                    # Extract image URL
+                    img_tag = container.find('img')
+                    if not img_tag:
+                        continue
+                    img_url = img_tag.get('src') or img_tag.get('data-src')
+                    if not img_url or not img_url.startswith('http'):
+                        continue
+                    # Extract title/alt text
+                    title = img_tag.get('alt', '') or img_tag.get('title', '')
+                    # Extract source URL
+                    link_tag = container.find('a')
+                    source_url = link_tag.get('href', '') if link_tag else ''
+                    results.append({
+                        'url': img_url,
+                        'title': title,
+                        'source_url': source_url,
+                        'source': 'google_images',
+                        'type': 'image'
+                    })
+                except Exception as e:
+                    logger.debug(f"Error parsing Google image: {e}")
+                    continue
+            return results
+        except Exception as e:
+            logger.warning(f"Google Images search failed: {e}")
+            return []
+    def _search_bing_images(self, query: str, num_results: int, language: str) -> List[Dict]:
+        """Search Bing Images for cooking content"""
+        try:
+            cooking_query = f"{query} recipe cooking food"
+            url = "https://www.bing.com/images/search"
+            params = {
+                'q': cooking_query,
+                'qft': '+filterui:imagesize-large',  # Large images
+                'form': 'HDRSC2',
+                'first': '1',
+                'count': num_results
+            }
+            response = self.session.get(url, params=params, timeout=self.timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            results = []
+            # Find image containers
+            image_containers = soup.find_all('div', class_='img_cont')
+            for container in image_containers[:num_results]:
+                try:
+                    img_tag = container.find('img')
+                    if not img_tag:
+                        continue
+                    img_url = img_tag.get('src') or img_tag.get('data-src')
+                    if not img_url or not img_url.startswith('http'):
+                        continue
+                    title = img_tag.get('alt', '') or img_tag.get('title', '')
+                    results.append({
+                        'url': img_url,
+                        'title': title,
+                        'source_url': '',
+                        'source': 'bing_images',
+                        'type': 'image'
+                    })
+                except Exception as e:
+                    logger.debug(f"Error parsing Bing image: {e}")
+                    continue
+            return results
+        except Exception as e:
+            logger.warning(f"Bing Images search failed: {e}")
+            return []
+    def _search_unsplash(self, query: str, num_results: int, language: str) -> List[Dict]:
+        """Search Unsplash for high-quality cooking images"""
+        try:
+            cooking_query = f"{query} food cooking recipe"
+            url = "https://unsplash.com/s/photos/" + cooking_query.replace(' ', '-')
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            results = []
+            # Find image containers
+            image_containers = soup.find_all('figure')
+            for container in image_containers[:num_results]:
+                try:
+                    img_tag = container.find('img')
+                    if not img_tag:
+                        continue
+                    img_url = img_tag.get('src') or img_tag.get('data-src')
+                    if not img_url or not img_url.startswith('http'):
+                        continue
+                    title = img_tag.get('alt', '') or img_tag.get('title', '')
+                    # Get source URL
+                    link_tag = container.find('a')
+                    source_url = link_tag.get('href', '') if link_tag else ''
+                    if source_url and not source_url.startswith('http'):
+                        source_url = 'https://unsplash.com' + source_url
+                    results.append({
+                        'url': img_url,
+                        'title': title,
+                        'source_url': source_url,
+                        'source': 'unsplash',
+                        'type': 'image'
+                    })
+                except Exception as e:
+                    logger.debug(f"Error parsing Unsplash image: {e}")
+                    continue
+            return results
+        except Exception as e:
+            logger.warning(f"Unsplash search failed: {e}")
+            return []
+    def _filter_cooking_relevance(self, images: List[Dict], query: str) -> List[Dict]:
+        """Filter images for cooking relevance"""
+        cooking_keywords = [
+            'food', 'cooking', 'recipe', 'dish', 'meal', 'ingredient', 'kitchen',
+            'chef', 'bake', 'cook', 'preparation', 'cuisine', 'delicious', 'tasty'
+        ]
+        relevant_images = []
+        query_lower = query.lower()
+        for image in images:
+            title = image.get('title', '').lower()
+            # Check if title contains cooking keywords or query terms
+            is_relevant = (
+                any(keyword in title for keyword in cooking_keywords) or
+                any(word in title for word in query_lower.split() if len(word) > 3)
+            )
+            if is_relevant:
+                relevant_images.append(image)
+        return relevant_images

search/engines/video.py CHANGED Viewed

@@ -50,13 +50,6 @@ class VideoSearchEngine:
                     'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
                     'base_url': 'https://www.youtube.com'
                 },
-                {
-                    'name': 'vinmec_videos',
-                    'search_url': 'https://www.vinmec.com/vi/tim-kiem',
-                    'params': {'q': ''},
-                    'selectors': ['a[href*="/video/"]', 'a[href*="/suc-khoe/"]'],
-                    'base_url': 'https://www.vinmec.com'
-                }
             ],
             'zh': [
                 {
@@ -87,8 +80,8 @@ class VideoSearchEngine:
         q = re.sub(r"\s+", " ", q)
         return q.strip()
-    def _is_valid_medical_video(self, result: Dict, query: str) -> bool:
-        """Check if video is medically relevant and has valid URL"""
         url = result.get('url', '')
         title = result.get('title', '')
@@ -96,25 +89,23 @@ class VideoSearchEngine:
         if 'results?search_query=' in url:
             return False
-        # Skip non-YouTube URLs that aren't medical platforms
-        if 'youtube.com' not in url and not any(med in url for med in ['medscape.com', 'vinmec.com', 'haodf.com']):
             return False
-        # Check if title contains medical keywords or query terms
         title_lower = title.lower()
         query_lower = query.lower()
-        medical_keywords = [
-            'medical', 'health', 'doctor', 'treatment', 'diagnosis',
-            'symptoms', 'therapy', 'medicine', 'clinical', 'patient',
-            'disease', 'condition', 'healthcare', 'physician'
         ]
-        # Must contain medical keywords or query terms
-        has_medical = any(keyword in title_lower for keyword in medical_keywords)
         has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
-        return has_medical or has_query
     def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
         """Search platform with retry logic and better error handling"""
@@ -130,9 +121,9 @@ class VideoSearchEngine:
         return []
     def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
-        """Search for medical videos across platforms with deduplication and medical filtering"""
         query = self._normalize_query(query)
-        logger.info(f"Searching for medical videos: {query} (language: {language})")
         results = []
         seen_urls = set()  # Track URLs to avoid duplicates
@@ -152,7 +143,7 @@ class VideoSearchEngine:
                     logger.warning(f"No results from {platform['name']}")
                     continue
-                # Filter out duplicates and non-medical content
                 for result in platform_results:
                     url = result.get('url', '')
                     video_id = self._extract_video_id(url)
@@ -161,8 +152,8 @@ class VideoSearchEngine:
                     if url in seen_urls or (video_id and video_id in seen_video_ids):
                         continue
-                    # Check if it's a valid medical video (less strict for more results)
-                    if self._is_valid_medical_video(result, query):
                         seen_urls.add(url)
                         if video_id:
                             seen_video_ids.add(video_id)
@@ -192,7 +183,7 @@ class VideoSearchEngine:
                     if (url not in seen_urls and
                         video_id not in seen_video_ids and
-                        self._is_valid_medical_video(result, query)):
                         seen_urls.add(url)
                         if video_id:
                             seen_video_ids.add(video_id)
@@ -373,8 +364,8 @@ class VideoSearchEngine:
         fallback_videos = {
             'en': [
                 {
-                    'url': 'https://www.youtube.com/results?search_query=medical+' + quote(query),
-                    'title': f'Medical Videos: {query}',
                     'platform': 'youtube_fallback',
                     'type': 'video',
                     'source': 'youtube'

                     'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
                     'base_url': 'https://www.youtube.com'
                 },
             ],
             'zh': [
                 {
         q = re.sub(r"\s+", " ", q)
         return q.strip()
+    def _is_valid_cooking_video(self, result: Dict, query: str) -> bool:
+        """Check if video is cooking-relevant and has valid URL"""
         url = result.get('url', '')
         title = result.get('title', '')
         if 'results?search_query=' in url:
             return False
+        # Skip non-YouTube URLs that aren't cooking platforms
+        if 'youtube.com' not in url and not any(cook in url for cook in ['allrecipes.com', 'foodnetwork.com', 'epicurious.com', 'seriouseats.com']):
             return False
+        # Check if title contains cooking keywords or query terms
         title_lower = title.lower()
         query_lower = query.lower()
+        cooking_keywords = [
+            'recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner'
         ]
+        # Must contain cooking keywords or query terms
+        has_cooking = any(keyword in title_lower for keyword in cooking_keywords)
         has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
+        return has_cooking or has_query
     def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
         """Search platform with retry logic and better error handling"""
         return []
     def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
+        """Search for cooking videos across platforms with deduplication and cooking filtering"""
         query = self._normalize_query(query)
+        logger.info(f"Searching for cooking videos: {query} (language: {language})")
         results = []
         seen_urls = set()  # Track URLs to avoid duplicates
                     logger.warning(f"No results from {platform['name']}")
                     continue
+                # Filter out duplicates and non-cooking content
                 for result in platform_results:
                     url = result.get('url', '')
                     video_id = self._extract_video_id(url)
                     if url in seen_urls or (video_id and video_id in seen_video_ids):
                         continue
+                    # Check if it's a valid cooking video (less strict for more results)
+                    if self._is_valid_cooking_video(result, query):
                         seen_urls.add(url)
                         if video_id:
                             seen_video_ids.add(video_id)
                     if (url not in seen_urls and
                         video_id not in seen_video_ids and
+                        self._is_valid_cooking_video(result, query)):
                         seen_urls.add(url)
                         if video_id:
                             seen_video_ids.add(video_id)
         fallback_videos = {
             'en': [
                 {
+                    'url': 'https://www.youtube.com/results?search_query=cooking+' + quote(query),
+                    'title': f'Cooking Videos: {query}',
                     'platform': 'youtube_fallback',
                     'type': 'video',
                     'source': 'youtube'

search/processors/cooking.py CHANGED Viewed

@@ -30,7 +30,7 @@ class CookingSearchProcessor:
             ],
             'dietary': [
                 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
-                'healthy', 'low-carb', 'low-fat', 'protein', 'fiber'
             ],
             'meal_types': [
                 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
@@ -272,12 +272,12 @@ class CookingSearchProcessor:
         # Add topic header
         topic_headers = {
-            'recipes': "**🍳 Recipes and Instructions:**",
-            'techniques': "**👨‍🍳 Cooking Techniques:**",
-            'ingredients': "**🥘 Ingredients and Substitutions:**",
-            'equipment': "**🔪 Equipment and Tools:**",
-            'tips_tricks': "**💡 Tips and Tricks:**",
-            'general': "**📚 General Information:**"
         }
         header = topic_headers.get(topic, "**Information:**")

             ],
             'dietary': [
                 'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
+                'healthy', 'low-carb', 'low-fat', 'protein', 'fiber', 'nutritious', 'balanced'
             ],
             'meal_types': [
                 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
         # Add topic header
         topic_headers = {
+            'recipes': "**Recipes and Instructions:**",
+            'techniques': "**Cooking Techniques:**",
+            'ingredients': "**Ingredients and Substitutions:**",
+            'equipment': "**Equipment and Tools:**",
+            'tips_tricks': "**Tips and Tricks:**",
+            'general': "**General Information:**"
         }
         header = topic_headers.get(topic, "**Information:**")

search/processors/enhanced.py CHANGED Viewed

@@ -92,7 +92,7 @@ class EnhancedContentProcessor:
         return comprehensive_summary, reference_mapping
     def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
-        """Extract structured information by medical categories"""
         structured_info = defaultdict(list)
         for source in sources:
@@ -100,8 +100,8 @@ class EnhancedContentProcessor:
             if not content:
                 continue
-            # Extract information by medical categories
-            for category, patterns in self.medical_patterns.items():
                 extracted_info = self._extract_category_info(content, patterns, category, user_query)
                 if extracted_info:
                     structured_info[category].append({

         return comprehensive_summary, reference_mapping
     def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
+        """Extract structured information by cooking categories"""
         structured_info = defaultdict(list)
         for source in sources:
             if not content:
                 continue
+            # Extract information by cooking categories
+            for category, patterns in self.cooking_patterns.items():
                 extracted_info = self._extract_category_info(content, patterns, category, user_query)
                 if extracted_info:
                     structured_info[category].append({

search/processors/language.py CHANGED Viewed

@@ -10,57 +10,59 @@ logger = logging.getLogger(__name__)
 DetectorFactory.seed = 0
 class LanguageProcessor:
-    """Process and enhance queries for multilingual medical search"""
     def __init__(self):
-        # Medical keywords in different languages
-        self.medical_keywords = {
             'en': [
-                'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
-                'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
-                'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
-                'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
-                'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
-                'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
-                'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
-                'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
-                'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
             ],
             'vi': [
-                'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
-                'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
-                'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác sĩ', 'y tế',
-                'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
-                'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
-                'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
-                'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
-                'em bé', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
-                'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
             ],
             'zh': [
-                '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
-                '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
-                '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
-                '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
-                '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
-                '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
-                '手术', '医院', '诊所'
             ]
         }
         # Language-specific search enhancements
         self.language_enhancements = {
             'vi': {
-                'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
-                'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác sĩ']
             },
             'zh': {
-                'common_terms': ['是什么', '原因', '治疗方法', '症状'],
-                'medical_context': ['医疗', '健康', '医院', '医生']
             },
             'en': {
-                'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
-                'medical_context': ['medical', 'health', 'hospital', 'doctor']
             }
         }
@@ -136,16 +138,16 @@ class LanguageProcessor:
         """Enhance query for a specific language"""
         enhancements = self.language_enhancements.get(language, {})
         common_terms = enhancements.get('common_terms', [])
-        medical_context = enhancements.get('medical_context', [])
-        # Check if query already contains medical context
         query_lower = query.lower()
-        has_medical_context = any(term in query_lower for term in medical_context)
-        # If no medical context, add it
-        if not has_medical_context and medical_context:
-            # Add the most relevant medical context term
-            query += f" {medical_context[0]}"
         # Check if query is a question and add relevant terms
         if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
@@ -155,62 +157,58 @@ class LanguageProcessor:
         return query.strip()
     def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
-        """Simple keyword-based translation for medical terms"""
         # This is a basic implementation - in production, you'd use a proper translation service
-        # Medical term translations
         translations = {
             ('vi', 'en'): {
-                'triệu chứng': 'symptoms',
-                'đau': 'pain',
-                'đau đầu': 'headache',
-                'sốt': 'fever',
-                'ho': 'cough',
-                'điều trị': 'treatment',
-                'thuốc': 'medicine',
-                'bệnh': 'disease',
-                'bác sĩ': 'doctor',
-                'sức khỏe': 'health',
-                'bệnh viện': 'hospital'
             },
             ('zh', 'en'): {
-                '症状': 'symptoms',
-                '疼痛': 'pain',
-                '头痛': 'headache',
-                '发烧': 'fever',
-                '咳嗽': 'cough',
-                '治疗': 'treatment',
-                '药物': 'medicine',
-                '疾病': 'disease',
-                '医生': 'doctor',
-                '健康': 'health',
-                '医院': 'hospital'
             },
             ('en', 'vi'): {
-                'symptoms': 'triệu chứng',
-                'pain': 'đau',
-                'headache': 'đau đầu',
-                'fever': 'sốt',
-                'cough': 'ho',
-                'treatment': 'điều trị',
-                'medicine': 'thuốc',
-                'disease': 'bệnh',
-                'doctor': 'bác sĩ',
-                'health': 'sức khỏe',
-                'hospital': 'bệnh viện'
             },
             ('en', 'zh'): {
-                'symptoms': '症状',
-                'pain': '疼痛',
-                'headache': '头痛',
-                'fever': '发烧',
-                'cough': '咳嗽',
-                'treatment': '治疗',
-                'medicine': '药物',
-                'disease': '疾病',
-                'doctor': '医生',
-                'health': '健康',
-                'hospital': '医院'
             }
         }
@@ -223,12 +221,12 @@ class LanguageProcessor:
         return translated_query
-    def get_medical_relevance_score(self, text: str, language: str) -> float:
-        """Calculate medical relevance score for text in a specific language"""
         if not text:
             return 0.0
-        keywords = self.medical_keywords.get(language, [])
         if not keywords:
             return 0.0

 DetectorFactory.seed = 0
 class LanguageProcessor:
+    """Process and enhance queries for multilingual cooking search"""
     def __init__(self):
+        # Cooking keywords in different languages
+        self.cooking_keywords = {
             'en': [
+                'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
+                'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
+                'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
+                'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
+                'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
+                'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
+                'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai',
+                'substitution', 'alternative', 'variation', 'modification', 'adaptation',
+                'troubleshooting', 'tips', 'tricks', 'hacks', 'mistakes', 'common errors'
             ],
             'vi': [
+                'công thức', 'nấu ăn', 'nướng', 'rang', 'nướng vỉ', 'chiên', 'luộc', 'hấp',
+                'nguyên liệu', 'gia vị', 'thảo mộc', 'nước sốt', 'tẩm ướp', 'dressing',
+                'kỹ thuật', 'phương pháp', 'nhiệt độ', 'thời gian', 'chuẩn bị', 'thời gian nấu',
+                'lò nướng', 'bếp', 'vỉ nướng', 'chảo', 'nồi', 'dao', 'cắt',
+                'chay', 'thuần chay', 'không gluten', 'không sữa', 'keto', 'paleo',
+                'khai vị', 'món chính', 'tráng miệng', 'sáng', 'trưa', 'tối',
+                'ẩm thực', 'ý', 'trung', 'mexico', 'pháp', 'ấn', 'thái',
+                'thay thế', 'biến tấu', 'sửa đổi', 'thích ứng',
+                'khắc phục', 'mẹo', 'thủ thuật', 'lỗi thường gặp'
             ],
             'zh': [
+                '食谱', '烹饪', '烘焙', '烤', '烧烤', '炸', '煮', '蒸',
+                '食材', '调料', '香料', '香草', '酱汁', '腌料', '调料',
+                '技巧', '方法', '温度', '时间', '准备', '烹饪时间',
+                '烤箱', '炉灶', '烤架', '平底锅', '锅', '刀', '切',
+                '素食', '纯素', '无麸质', '无乳制品', '生酮', '古法',
+                '开胃菜', '主菜', '甜点', '早餐', '午餐', '晚餐',
+                '菜系', '意大利', '中国', '墨西哥', '法国', '印度', '泰国',
+                '替代', '变化', '修改', '适应',
+                '故障排除', '技巧', '窍门', '常见错误'
             ]
         }
         # Language-specific search enhancements
         self.language_enhancements = {
             'vi': {
+                'common_terms': ['là gì', 'cách nấu', 'công thức', 'nguyên liệu'],
+                'cooking_context': ['nấu ăn', 'ẩm thực', 'bếp', 'đầu bếp']
             },
             'zh': {
+                'common_terms': ['是什么', '怎么做', '食谱', '食材'],
+                'cooking_context': ['烹饪', '美食', '厨房', '厨师']
             },
             'en': {
+                'common_terms': ['what is', 'how to cook', 'recipe', 'ingredients'],
+                'cooking_context': ['cooking', 'culinary', 'kitchen', 'chef']
             }
         }
         """Enhance query for a specific language"""
         enhancements = self.language_enhancements.get(language, {})
         common_terms = enhancements.get('common_terms', [])
+        cooking_context = enhancements.get('cooking_context', [])
+        # Check if query already contains cooking context
         query_lower = query.lower()
+        has_cooking_context = any(term in query_lower for term in cooking_context)
+        # If no cooking context, add it
+        if not has_cooking_context and cooking_context:
+            # Add the most relevant cooking context term
+            query += f" {cooking_context[0]}"
         # Check if query is a question and add relevant terms
         if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
         return query.strip()
     def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
+        """Simple keyword-based translation for cooking terms"""
         # This is a basic implementation - in production, you'd use a proper translation service
+        # Cooking term translations
         translations = {
             ('vi', 'en'): {
+                'công thức': 'recipe',
+                'nấu ăn': 'cooking',
+                'nguyên liệu': 'ingredients',
+                'gia vị': 'seasoning',
+                'kỹ thuật': 'technique',
+                'nướng': 'baking',
+                'chiên': 'frying',
+                'luộc': 'boiling',
+                'hấp': 'steaming',
+                'nước sốt': 'sauce'
             },
             ('zh', 'en'): {
+                '食谱': 'recipe',
+                '烹饪': 'cooking',
+                '食材': 'ingredients',
+                '调料': 'seasoning',
+                '技巧': 'technique',
+                '烘焙': 'baking',
+                '炸': 'frying',
+                '煮': 'boiling',
+                '蒸': 'steaming',
+                '酱汁': 'sauce'
             },
             ('en', 'vi'): {
+                'recipe': 'công thức',
+                'cooking': 'nấu ăn',
+                'ingredients': 'nguyên liệu',
+                'seasoning': 'gia vị',
+                'technique': 'kỹ thuật',
+                'baking': 'nướng',
+                'frying': 'chiên',
+                'boiling': 'luộc',
+                'steaming': 'hấp',
+                'sauce': 'nước sốt'
             },
             ('en', 'zh'): {
+                'recipe': '食谱',
+                'cooking': '烹饪',
+                'ingredients': '食材',
+                'seasoning': '调料',
+                'technique': '技巧',
+                'baking': '烘焙',
+                'frying': '炸',
+                'boiling': '煮',
+                'steaming': '蒸',
+                'sauce': '酱汁'
             }
         }
         return translated_query
+    def get_cooking_relevance_score(self, text: str, language: str) -> float:
+        """Calculate cooking relevance score for text in a specific language"""
         if not text:
             return 0.0
+        keywords = self.cooking_keywords.get(language, [])
         if not keywords:
             return 0.0

search/processors/sources.py CHANGED Viewed

@@ -13,9 +13,9 @@ class SourceAggregator:
         # (Removed credibility scoring; keep placeholder map for future use)
         self.source_credibility = {
             # English sources
-            'mayoclinic.org': 0.95,
-            'webmd.com': 0.90,
-            'healthline.com': 0.88,
             'medlineplus.gov': 0.95,
             'nih.gov': 0.98,
             'cdc.gov': 0.98,
@@ -29,7 +29,7 @@ class SourceAggregator:
             'hellobacsi.com': 0.85,
             'alobacsi.com': 0.82,
             'vinmec.com': 0.88,
-            'tamanhhospital.vn': 0.85,
             'medlatec.vn': 0.83,
             'suckhoedoisong.vn': 0.90,
             'viendinhduong.vn': 0.87,
@@ -40,7 +40,7 @@ class SourceAggregator:
             'chunyuyisheng.com': 0.84,
             'xywy.com': 0.82,
             'jiankang.com': 0.80,
-            'familydoctor.com.cn': 0.85,
             # Video platforms
             'youtube.com': 0.70,
@@ -50,8 +50,8 @@ class SourceAggregator:
         # Source type classification
         self.source_types = {
             'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
-            'hospital': ['mayoclinic.org', 'vinmec.com', 'tamanhhospital.vn'],
-            'commercial': ['webmd.com', 'healthline.com', 'hellobacsi.com'],
             'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
             'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
             'video': ['youtube.com', 'medscape.com']
@@ -325,7 +325,7 @@ class SourceAggregator:
             # Create type indicator
             type_icons = {
                 'academic': '🎓',
-                'hospital': '🏥',
                 'government': '🏛️',
                 'commercial': '💼',
                 'professional': '👨‍⚕️',

         # (Removed credibility scoring; keep placeholder map for future use)
         self.source_credibility = {
             # English sources
+            'allrecipes.com': 0.95,
+            'foodnetwork.com': 0.90,
+            'epicurious.com': 0.88,
             'medlineplus.gov': 0.95,
             'nih.gov': 0.98,
             'cdc.gov': 0.98,
             'hellobacsi.com': 0.85,
             'alobacsi.com': 0.82,
             'vinmec.com': 0.88,
+            'monngonviet.com': 0.85,
             'medlatec.vn': 0.83,
             'suckhoedoisong.vn': 0.90,
             'viendinhduong.vn': 0.87,
             'chunyuyisheng.com': 0.84,
             'xywy.com': 0.82,
             'jiankang.com': 0.80,
+            'xiachufang.com': 0.85,
             # Video platforms
             'youtube.com': 0.70,
         # Source type classification
         self.source_types = {
             'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
+            'cooking_sites': ['allrecipes.com', 'foodnetwork.com', 'epicurious.com'],
+            'commercial': ['seriouseats.com', 'bonappetit.com', 'tasteofhome.com'],
             'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
             'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
             'video': ['youtube.com', 'medscape.com']
             # Create type indicator
             type_icons = {
                 'academic': '🎓',
+                'cooking_sites': '🍳',
                 'government': '🏛️',
                 'commercial': '💼',
                 'professional': '👨‍⚕️',

search/search.py CHANGED Viewed

@@ -4,6 +4,7 @@ import time
 import hashlib
 from .engines.duckduckgo import DuckDuckGoEngine
 from .engines.video import VideoSearchEngine
 from .coordinator import SearchCoordinator
 # Reranker removed - using simple relevance scoring for cooking content
 from models import summarizer
@@ -13,6 +14,7 @@ logger = logging.getLogger(__name__)
 # Global instances
 _duckduckgo_engine = None
 _video_engine = None
 _reranker = None
 _search_coordinator = None
@@ -34,6 +36,13 @@ def get_video_engine() -> VideoSearchEngine:
         _video_engine = VideoSearchEngine()
     return _video_engine
 def get_reranker():
     """Simple cooking relevance scorer - no complex reranking needed"""
     return None
@@ -237,8 +246,20 @@ def search_videos(query: str, num_results: int = 2, target_language: str = None)
         logger.error(f"Video search failed: {e}")
         return []
 # Comprehensive search function with maximum information extraction
-def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
     """Comprehensive search with maximum information extraction and detailed references"""
     logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
@@ -299,8 +320,20 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
         except Exception as e:
             logger.warning(f"Video search failed: {e}")
     # Combine all results
-    all_results = text_results + video_results
     # Simple cooking relevance filtering
     if all_results:
@@ -351,7 +384,10 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
         'total_sources': len(all_results),
         'text_sources': len(text_results),
         'video_sources': len(video_results),
-        'sources': all_results
     }
     logger.info(f"Comprehensive search completed: {len(all_results)} total sources")

 import hashlib
 from .engines.duckduckgo import DuckDuckGoEngine
 from .engines.video import VideoSearchEngine
+from .engines.image import ImageSearchEngine
 from .coordinator import SearchCoordinator
 # Reranker removed - using simple relevance scoring for cooking content
 from models import summarizer
 # Global instances
 _duckduckgo_engine = None
 _video_engine = None
+_image_engine = None
 _reranker = None
 _search_coordinator = None
         _video_engine = VideoSearchEngine()
     return _video_engine
+def get_image_engine() -> ImageSearchEngine:
+    """Get or create the global image engine instance"""
+    global _image_engine
+    if _image_engine is None:
+        _image_engine = ImageSearchEngine()
+    return _image_engine
 def get_reranker():
     """Simple cooking relevance scorer - no complex reranking needed"""
     return None
         logger.error(f"Video search failed: {e}")
         return []
+# Image search function
+def search_images(query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
+    """Search for cooking-related images"""
+    try:
+        # Clean the query first
+        cleaned_query = _clean_search_query(query)
+        coordinator = get_search_coordinator()
+        return coordinator.image_search(cleaned_query, num_results, target_language)
+    except Exception as e:
+        logger.error(f"Image search failed: {e}")
+        return []
 # Comprehensive search function with maximum information extraction
+def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True, include_images: bool = True) -> Tuple[str, Dict[int, str], Dict]:
     """Comprehensive search with maximum information extraction and detailed references"""
     logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
         except Exception as e:
             logger.warning(f"Video search failed: {e}")
+    # Search for images if requested
+    image_results = []
+    if include_images:
+        try:
+            image_engine = get_image_engine()
+            # Limit image results to avoid over-fetching
+            max_image_results = min(3, num_results // 5)  # Max 3 or 1/5 of total
+            image_results = image_engine.search_cooking_images(boosted_query, max_image_results, search_language)
+            logger.info(f"Found {len(image_results)} image results")
+        except Exception as e:
+            logger.warning(f"Image search failed: {e}")
     # Combine all results
+    all_results = text_results + video_results + image_results
     # Simple cooking relevance filtering
     if all_results:
         'total_sources': len(all_results),
         'text_sources': len(text_results),
         'video_sources': len(video_results),
+        'image_sources': len(image_results),
+        'sources': all_results,
+        'videos': video_results,
+        'images': image_results
     }
     logger.info(f"Comprehensive search completed: {len(all_results)} total sources")