LiamKhoaLe committed on
Commit
f12a3b4
·
1 Parent(s): 07c35d1

Update image fetcher

Browse files
Files changed (2) hide show
  1. search/engines/image.py +83 -18
  2. search/search.py +11 -10
search/engines/image.py CHANGED
@@ -18,7 +18,11 @@ class ImageSearchEngine:
18
  self.timeout = timeout
19
 
20
  def search_cooking_images(self, query: str, num_results: int = 3, language: str = "en") -> List[Dict]:
21
- """Search for cooking-related images"""
 
 
 
 
22
  results = []
23
 
24
  # Try multiple image search strategies
@@ -32,15 +36,62 @@ class ImageSearchEngine:
32
  try:
33
  strategy_results = strategy(query, num_results, language)
34
  if strategy_results:
35
- results.extend(strategy_results)
36
- logger.info(f"Image search found {len(strategy_results)} results")
37
- if len(results) >= num_results:
38
- break
 
 
 
39
  except Exception as e:
40
  logger.warning(f"Image search strategy failed: {e}")
41
  continue
42
 
43
- return results[:num_results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def _search_google_images(self, query: str, num_results: int, language: str) -> List[Dict]:
46
  """Search Google Images for cooking content"""
@@ -53,7 +104,8 @@ class ImageSearchEngine:
53
  'q': cooking_query,
54
  'tbm': 'isch', # Image search
55
  'hl': language,
56
- 'safe': 'active'
 
57
  }
58
 
59
  response = self.session.get(url, params=params, timeout=self.timeout)
@@ -62,8 +114,11 @@ class ImageSearchEngine:
62
  soup = BeautifulSoup(response.content, 'html.parser')
63
  results = []
64
 
65
- # Find image containers
66
  image_containers = soup.find_all('div', class_='islrc')
 
 
 
67
 
68
  for container in image_containers[:num_results]:
69
  try:
@@ -72,12 +127,12 @@ class ImageSearchEngine:
72
  if not img_tag:
73
  continue
74
 
75
- img_url = img_tag.get('src') or img_tag.get('data-src')
76
  if not img_url or not img_url.startswith('http'):
77
  continue
78
 
79
  # Extract title/alt text
80
- title = img_tag.get('alt', '') or img_tag.get('title', '')
81
 
82
  # Extract source URL
83
  link_tag = container.find('a')
@@ -112,7 +167,7 @@ class ImageSearchEngine:
112
  'qft': '+filterui:imagesize-large', # Large images
113
  'form': 'HDRSC2',
114
  'first': '1',
115
- 'count': num_results
116
  }
117
 
118
  response = self.session.get(url, params=params, timeout=self.timeout)
@@ -121,8 +176,11 @@ class ImageSearchEngine:
121
  soup = BeautifulSoup(response.content, 'html.parser')
122
  results = []
123
 
124
- # Find image containers
125
  image_containers = soup.find_all('div', class_='img_cont')
 
 
 
126
 
127
  for container in image_containers[:num_results]:
128
  try:
@@ -130,16 +188,20 @@ class ImageSearchEngine:
130
  if not img_tag:
131
  continue
132
 
133
- img_url = img_tag.get('src') or img_tag.get('data-src')
134
  if not img_url or not img_url.startswith('http'):
135
  continue
136
 
137
- title = img_tag.get('alt', '') or img_tag.get('title', '')
 
 
 
 
138
 
139
  results.append({
140
  'url': img_url,
141
  'title': title,
142
- 'source_url': '',
143
  'source': 'bing_images',
144
  'type': 'image'
145
  })
@@ -167,8 +229,11 @@ class ImageSearchEngine:
167
  soup = BeautifulSoup(response.content, 'html.parser')
168
  results = []
169
 
170
- # Find image containers
171
  image_containers = soup.find_all('figure')
 
 
 
172
 
173
  for container in image_containers[:num_results]:
174
  try:
@@ -176,11 +241,11 @@ class ImageSearchEngine:
176
  if not img_tag:
177
  continue
178
 
179
- img_url = img_tag.get('src') or img_tag.get('data-src')
180
  if not img_url or not img_url.startswith('http'):
181
  continue
182
 
183
- title = img_tag.get('alt', '') or img_tag.get('title', '')
184
 
185
  # Get source URL
186
  link_tag = container.find('a')
 
18
  self.timeout = timeout
19
 
20
  def search_cooking_images(self, query: str, num_results: int = 3, language: str = "en") -> List[Dict]:
21
+ """Search for cooking-related images with robust error handling"""
22
+ if not query or not query.strip():
23
+ logger.warning("Empty query provided for image search")
24
+ return []
25
+
26
  results = []
27
 
28
  # Try multiple image search strategies
 
36
  try:
37
  strategy_results = strategy(query, num_results, language)
38
  if strategy_results:
39
+ # Filter and validate results
40
+ valid_results = self._validate_image_results(strategy_results)
41
+ if valid_results:
42
+ results.extend(valid_results)
43
+ logger.info(f"Image search strategy found {len(valid_results)} valid results")
44
+ if len(results) >= num_results:
45
+ break
46
  except Exception as e:
47
  logger.warning(f"Image search strategy failed: {e}")
48
  continue
49
 
50
+ # Remove duplicates and return
51
+ unique_results = self._remove_duplicate_images(results)
52
+ final_results = unique_results[:num_results]
53
+
54
+ logger.info(f"Image search completed: {len(final_results)} unique results from {len(results)} total")
55
+ return final_results
56
+
57
+ def _validate_image_results(self, results: List[Dict]) -> List[Dict]:
58
+ """Validate and clean image results"""
59
+ valid_results = []
60
+
61
+ for result in results:
62
+ try:
63
+ # Check required fields
64
+ if not result.get('url') or not result.get('url').startswith('http'):
65
+ continue
66
+
67
+ # Ensure we have at least a basic title
68
+ if not result.get('title'):
69
+ result['title'] = 'Cooking image'
70
+
71
+ # Ensure we have alt text
72
+ if not result.get('alt_text'):
73
+ result['alt_text'] = result.get('title', 'Cooking image')
74
+
75
+ valid_results.append(result)
76
+
77
+ except Exception as e:
78
+ logger.debug(f"Invalid image result skipped: {e}")
79
+ continue
80
+
81
+ return valid_results
82
+
83
+ def _remove_duplicate_images(self, results: List[Dict]) -> List[Dict]:
84
+ """Remove duplicate images based on URL"""
85
+ seen_urls = set()
86
+ unique_results = []
87
+
88
+ for result in results:
89
+ url = result.get('url', '')
90
+ if url and url not in seen_urls:
91
+ seen_urls.add(url)
92
+ unique_results.append(result)
93
+
94
+ return unique_results
95
 
96
  def _search_google_images(self, query: str, num_results: int, language: str) -> List[Dict]:
97
  """Search Google Images for cooking content"""
 
104
  'q': cooking_query,
105
  'tbm': 'isch', # Image search
106
  'hl': language,
107
+ 'safe': 'active',
108
+ 'num': min(num_results, 20) # Limit results
109
  }
110
 
111
  response = self.session.get(url, params=params, timeout=self.timeout)
 
114
  soup = BeautifulSoup(response.content, 'html.parser')
115
  results = []
116
 
117
+ # Find image containers with multiple selectors
118
  image_containers = soup.find_all('div', class_='islrc')
119
+ if not image_containers:
120
+ # Try alternative selectors
121
+ image_containers = soup.find_all('div', {'data-ved': True})
122
 
123
  for container in image_containers[:num_results]:
124
  try:
 
127
  if not img_tag:
128
  continue
129
 
130
+ img_url = img_tag.get('src') or img_tag.get('data-src') or img_tag.get('data-original')
131
  if not img_url or not img_url.startswith('http'):
132
  continue
133
 
134
  # Extract title/alt text
135
+ title = img_tag.get('alt', '') or img_tag.get('title', '') or 'Cooking image'
136
 
137
  # Extract source URL
138
  link_tag = container.find('a')
 
167
  'qft': '+filterui:imagesize-large', # Large images
168
  'form': 'HDRSC2',
169
  'first': '1',
170
+ 'count': min(num_results, 20)
171
  }
172
 
173
  response = self.session.get(url, params=params, timeout=self.timeout)
 
176
  soup = BeautifulSoup(response.content, 'html.parser')
177
  results = []
178
 
179
+ # Find image containers with multiple selectors
180
  image_containers = soup.find_all('div', class_='img_cont')
181
+ if not image_containers:
182
+ # Try alternative selectors
183
+ image_containers = soup.find_all('div', {'class': 'imgpt'})
184
 
185
  for container in image_containers[:num_results]:
186
  try:
 
188
  if not img_tag:
189
  continue
190
 
191
+ img_url = img_tag.get('src') or img_tag.get('data-src') or img_tag.get('data-original')
192
  if not img_url or not img_url.startswith('http'):
193
  continue
194
 
195
+ title = img_tag.get('alt', '') or img_tag.get('title', '') or 'Cooking image'
196
+
197
+ # Try to get source URL
198
+ link_tag = container.find('a')
199
+ source_url = link_tag.get('href', '') if link_tag else ''
200
 
201
  results.append({
202
  'url': img_url,
203
  'title': title,
204
+ 'source_url': source_url,
205
  'source': 'bing_images',
206
  'type': 'image'
207
  })
 
229
  soup = BeautifulSoup(response.content, 'html.parser')
230
  results = []
231
 
232
+ # Find image containers with multiple selectors
233
  image_containers = soup.find_all('figure')
234
+ if not image_containers:
235
+ # Try alternative selectors
236
+ image_containers = soup.find_all('div', {'class': 'MorZF'})
237
 
238
  for container in image_containers[:num_results]:
239
  try:
 
241
  if not img_tag:
242
  continue
243
 
244
+ img_url = img_tag.get('src') or img_tag.get('data-src') or img_tag.get('data-original')
245
  if not img_url or not img_url.startswith('http'):
246
  continue
247
 
248
+ title = img_tag.get('alt', '') or img_tag.get('title', '') or 'Cooking image'
249
 
250
  # Get source URL
251
  link_tag = container.find('a')
search/search.py CHANGED
@@ -299,20 +299,21 @@ def search_comprehensive(query: str, num_results: int = 15, target_language: str
299
  except Exception as e:
300
  logger.warning(f"Simple fallback search failed: {e}")
301
 
 
 
 
 
 
 
 
 
 
 
 
302
  # Search for videos if requested (limit to avoid over-fetching)
303
  video_results = []
304
  if include_videos:
305
  try:
306
- # Map language codes for video search
307
- lang_mapping = {
308
- 'EN': 'en',
309
- 'VI': 'vi',
310
- 'ZH': 'zh',
311
- 'en': 'en',
312
- 'vi': 'vi',
313
- 'zh': 'zh'
314
- }
315
- search_language = lang_mapping.get(target_language, 'en')
316
  # Limit video results to avoid over-fetching
317
  max_video_results = min(5, num_results // 3) # Max 5 or 1/3 of total
318
  video_results = video_engine.search(boosted_query, num_results=max_video_results, language=search_language)
 
299
  except Exception as e:
300
  logger.warning(f"Simple fallback search failed: {e}")
301
 
302
+ # Map language codes for search engines
303
+ lang_mapping = {
304
+ 'EN': 'en',
305
+ 'VI': 'vi',
306
+ 'ZH': 'zh',
307
+ 'en': 'en',
308
+ 'vi': 'vi',
309
+ 'zh': 'zh'
310
+ }
311
+ search_language = lang_mapping.get(target_language, 'en')
312
+
313
  # Search for videos if requested (limit to avoid over-fetching)
314
  video_results = []
315
  if include_videos:
316
  try:
 
 
 
 
 
 
 
 
 
 
317
  # Limit video results to avoid over-fetching
318
  max_video_results = min(5, num_results // 3) # Max 5 or 1/3 of total
319
  video_results = video_engine.search(boosted_query, num_results=max_video_results, language=search_language)