Spaces:
Sleeping
Sleeping
Commit
·
cd38d69
1
Parent(s):
fe3e2c5
Rm image as citation #2
Browse files- api/chatbot.py +27 -3
api/chatbot.py
CHANGED
|
@@ -513,17 +513,41 @@ class CookingTutorChatbot:
|
|
| 513 |
|
| 514 |
# Remove common image URL patterns that might appear in text
|
| 515 |
image_url_patterns = [
|
| 516 |
-
|
| 517 |
-
r'
|
| 518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
]
|
| 520 |
|
| 521 |
cleaned_text = text
|
| 522 |
for pattern in image_url_patterns:
|
| 523 |
cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
|
| 524 |
|
|
|
|
|
|
|
|
|
|
| 525 |
# Clean up any extra whitespace left behind
|
| 526 |
cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)
|
|
|
|
| 527 |
cleaned_text = cleaned_text.strip()
|
| 528 |
|
| 529 |
return cleaned_text
|
|
|
|
| 513 |
|
| 514 |
# Remove common image URL patterns that might appear in text
|
| 515 |
image_url_patterns = [
|
| 516 |
+
# Direct image file extensions
|
| 517 |
+
r'https?://[^\s]+\.(jpg|jpeg|png|gif|webp|svg|bmp|tiff)(\?[^\s]*)?',
|
| 518 |
+
|
| 519 |
+
# Bing image URLs (like the one provided)
|
| 520 |
+
r'https?://tse\d+\.mm\.bing\.net/[^\s]+',
|
| 521 |
+
|
| 522 |
+
# Google image URLs
|
| 523 |
+
r'https?://encrypted-tbn\d+\.gstatic\.com/[^\s]+',
|
| 524 |
+
r'https?://images\d+\.googleusercontent\.com/[^\s]+',
|
| 525 |
+
|
| 526 |
+
# Other common image hosting services
|
| 527 |
+
r'https?://[^\s]*imgur[^\s]*\.(jpg|jpeg|png|gif|webp)',
|
| 528 |
+
r'https?://[^\s]*unsplash[^\s]*\.(jpg|jpeg|png|gif|webp)',
|
| 529 |
+
r'https?://[^\s]*pixabay[^\s]*\.(jpg|jpeg|png|gif|webp)',
|
| 530 |
+
|
| 531 |
+
# HTML img tags
|
| 532 |
+
r'<img[^>]*src=["\']([^"\']+)["\'][^>]*>',
|
| 533 |
+
|
| 534 |
+
# Markdown image syntax
|
| 535 |
+
r'!\[[^\]]*\]\([^)]+\)',
|
| 536 |
+
|
| 537 |
+
# URLs with image-related parameters
|
| 538 |
+
r'https?://[^\s]*\?(.*&)?(w=\d+|h=\d+|c=\d+|r=\d+|o=\d+|cb=\d+|pid=\d+|rm=\d+)(&.*)?',
|
| 539 |
]
|
| 540 |
|
| 541 |
cleaned_text = text
|
| 542 |
for pattern in image_url_patterns:
|
| 543 |
cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
|
| 544 |
|
| 545 |
+
# Remove standalone URLs that start with @ (like @https://...)
|
| 546 |
+
cleaned_text = re.sub(r'@https?://[^\s]+', '', cleaned_text)
|
| 547 |
+
|
| 548 |
# Clean up any extra whitespace left behind
|
| 549 |
cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)
|
| 550 |
+
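cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Collapse every whitespace run (spaces, tabs AND newlines) into a single space; note this also flattens the paragraph breaks preserved by the line above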
|
| 551 |
cleaned_text = cleaned_text.strip()
|
| 552 |
|
| 553 |
return cleaned_text
|