LiamKhoaLe committed on
Commit
cd38d69
·
1 Parent(s): fe3e2c5

Rm image as citation #2

Browse files
Files changed (1) hide show
  1. api/chatbot.py +27 -3
api/chatbot.py CHANGED
@@ -513,17 +513,41 @@ class CookingTutorChatbot:
513
 
514
  # Remove common image URL patterns that might appear in text
515
  image_url_patterns = [
516
- r'https?://[^\s]+\.(jpg|jpeg|png|gif|webp|svg)(\?[^\s]*)?', # Direct image URLs
517
- r'<img[^>]*src=["\']([^"\']+)["\'][^>]*>', # HTML img tags
518
- r'!\[[^\]]*\]\([^)]+\)', # Markdown image syntax
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
  ]
520
 
521
  cleaned_text = text
522
  for pattern in image_url_patterns:
523
  cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
524
 
 
 
 
525
  # Clean up any extra whitespace left behind
526
  cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)
 
527
  cleaned_text = cleaned_text.strip()
528
 
529
  return cleaned_text
 
513
 
514
  # Remove common image URL patterns that might appear in text
515
  image_url_patterns = [
516
+ # Direct image file extensions
517
+ r'https?://[^\s]+\.(jpg|jpeg|png|gif|webp|svg|bmp|tiff)(\?[^\s]*)?',
518
+
519
+ # Bing image URLs (like the one provided)
520
+ r'https?://tse\d+\.mm\.bing\.net/[^\s]+',
521
+
522
+ # Google image URLs
523
+ r'https?://encrypted-tbn\d+\.gstatic\.com/[^\s]+',
524
+ r'https?://images\d+\.googleusercontent\.com/[^\s]+',
525
+
526
+ # Other common image hosting services
527
+ r'https?://[^\s]*imgur[^\s]*\.(jpg|jpeg|png|gif|webp)',
528
+ r'https?://[^\s]*unsplash[^\s]*\.(jpg|jpeg|png|gif|webp)',
529
+ r'https?://[^\s]*pixabay[^\s]*\.(jpg|jpeg|png|gif|webp)',
530
+
531
+ # HTML img tags
532
+ r'<img[^>]*src=["\']([^"\']+)["\'][^>]*>',
533
+
534
+ # Markdown image syntax
535
+ r'!\[[^\]]*\]\([^)]+\)',
536
+
537
+ # URLs with image-related parameters
538
+ r'https?://[^\s]*\?(.*&)?(w=\d+|h=\d+|c=\d+|r=\d+|o=\d+|cb=\d+|pid=\d+|rm=\d+)(&.*)?',
539
  ]
540
 
541
  cleaned_text = text
542
  for pattern in image_url_patterns:
543
  cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
544
 
545
+ # Remove standalone URLs that start with @ (like @https://...)
546
+ cleaned_text = re.sub(r'@https?://[^\s]+', '', cleaned_text)
547
+
548
  # Clean up any extra whitespace left behind
549
  cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)
550
+ cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Replace multiple spaces with single space
551
  cleaned_text = cleaned_text.strip()
552
 
553
  return cleaned_text