Spaces:
Sleeping
Sleeping
Commit
·
07c35d1
1
Parent(s):
43c0263
Upd citation parser
Browse files- api/chatbot.py +101 -20
- api/routes.py +1 -1
api/chatbot.py
CHANGED
|
@@ -429,10 +429,47 @@ class CookingTutorChatbot:
|
|
| 429 |
return structured_blocks
|
| 430 |
|
| 431 |
def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
|
| 432 |
-
"""Replace citation tags with actual URLs, handling
|
| 433 |
|
| 434 |
-
#
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
def replace_citation(match):
|
| 438 |
citation_content = match.group(1)
|
|
@@ -441,29 +478,73 @@ class CookingTutorChatbot:
|
|
| 441 |
|
| 442 |
urls = []
|
| 443 |
for citation_id in citation_ids:
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
else:
|
| 451 |
logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
|
| 452 |
-
|
| 453 |
-
except ValueError:
|
| 454 |
-
logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
|
| 455 |
-
urls.append(f'<#{citation_id}>') # Keep original if invalid
|
| 456 |
|
| 457 |
# Join multiple URLs with spaces
|
| 458 |
return ' '.join(urls)
|
| 459 |
|
| 460 |
-
#
|
| 461 |
-
processed_response =
|
|
|
|
| 462 |
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
| 469 |
return processed_response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
return structured_blocks
|
| 430 |
|
| 431 |
def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
|
| 432 |
+
"""Replace citation tags with actual URLs, handling various citation formats flexibly"""
|
| 433 |
|
| 434 |
+
# More flexible pattern to match various citation formats
|
| 435 |
+
citation_patterns = [
|
| 436 |
+
r'<#([^>]+)>', # Standard format: <#1>, <#1,2,3>
|
| 437 |
+
r'<#ID\s*(\d+)>', # Format: <#ID 1>, <#ID 3>
|
| 438 |
+
r'<#\s*ID\s*(\d+)>', # Format: <# ID 1>
|
| 439 |
+
r'<#(\d+)>', # Simple format: <#1>
|
| 440 |
+
r'<#\s*(\d+)\s*>', # Format with spaces: <# 1 >
|
| 441 |
+
]
|
| 442 |
+
|
| 443 |
+
def extract_numeric_id(citation_id: str) -> "Optional[int]":
    """Return the first integer embedded in a citation token, or None.

    Accepts the text found inside a citation tag, for example ``"1"``,
    ``"ID 3"``, ``"# ID 7"`` or ``" 2 "``.

    Args:
        citation_id: Citation token text; may be empty or None.

    Returns:
        The first run of digits in the token converted to int, or None
        when the token is empty or contains no digits.
    """
    # Imported locally so the helper does not rely on a module-level import.
    import re

    if not citation_id:
        return None

    # Search for the first digit run directly. Deleting the non-digit
    # characters first would fuse separate numbers ("1-2" -> 12) and point
    # the citation at the wrong document.
    first_number = re.search(r'\d+', citation_id)
    if first_number is None:
        return None
    return int(first_number.group())
|
| 473 |
|
| 474 |
def replace_citation(match):
|
| 475 |
citation_content = match.group(1)
|
|
|
|
| 478 |
|
| 479 |
urls = []
|
| 480 |
for citation_id in citation_ids:
|
| 481 |
+
# Extract numeric ID from various formats
|
| 482 |
+
doc_id = extract_numeric_id(citation_id)
|
| 483 |
+
|
| 484 |
+
if doc_id is not None and doc_id in url_mapping:
|
| 485 |
+
url = url_mapping[doc_id]
|
| 486 |
+
urls.append(f'<{url}>')
|
| 487 |
+
logger.info(f"[CITATION] Replacing <#{citation_id}> with {url}")
|
| 488 |
+
else:
|
| 489 |
+
if doc_id is None:
|
| 490 |
+
logger.warning(f"[CITATION] Could not extract numeric ID from: {citation_id}")
|
| 491 |
else:
|
| 492 |
logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
|
| 493 |
+
urls.append(f'<#{citation_id}>') # Keep original if URL not found
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
# Join multiple URLs with spaces
|
| 496 |
return ' '.join(urls)
|
| 497 |
|
| 498 |
+
# Process with each pattern
|
| 499 |
+
processed_response = response
|
| 500 |
+
total_citations_processed = 0
|
| 501 |
|
| 502 |
+
for pattern in citation_patterns:
|
| 503 |
+
# Count citations before processing
|
| 504 |
+
citations_found = re.findall(pattern, processed_response)
|
| 505 |
+
if citations_found:
|
| 506 |
+
# Process citations with this pattern
|
| 507 |
+
processed_response = re.sub(pattern, replace_citation, processed_response)
|
| 508 |
+
total_citations_processed += sum(len([id_str.strip() for id_str in citation_content.split(',')])
|
| 509 |
+
for citation_content in citations_found)
|
| 510 |
+
logger.info(f"[CITATION] Processed {len(citations_found)} citation groups with pattern: {pattern}")
|
| 511 |
|
| 512 |
+
# Fallback: Handle any remaining malformed citations
|
| 513 |
+
processed_response = self._handle_malformed_citations(processed_response, url_mapping)
|
| 514 |
+
|
| 515 |
+
logger.info(f"[CITATION] Total citations processed: {total_citations_processed}, URL mappings available: {len(url_mapping)}")
|
| 516 |
return processed_response
|
| 517 |
+
|
| 518 |
+
def _handle_malformed_citations(self, text: str, url_mapping: Dict[int, str]) -> str:
    """Resolve leftover ``ID``-style citation tags to URLs in one pass.

    Recognised shapes (whitespace optional, ``ID`` matched in upper case
    only): ``<#ID 1>``, ``<# ID 1 >`` and the reversed ``<# 1 ID>``.

    Args:
        text: Response text that may still contain citation tags.
        url_mapping: Lookup from document ID to URL.

    Returns:
        ``text`` with every recognised tag whose ID is present in
        ``url_mapping`` replaced by ``<url>``. Tags whose ID is not mapped
        are left unchanged and a warning is logged for each.
    """
    import re

    # Both shapes are matched by one expression so the text is scanned once
    # and each tag is logged at most once.
    malformed_pattern = re.compile(r'<#\s*(?:ID\s*(\d+)|(\d+)\s*ID)\s*>')

    def clean_malformed_citation(match):
        citation_text = match.group(0)
        # Exactly one of the two capture groups participates in a match.
        doc_id = int(match.group(1) or match.group(2))
        if doc_id in url_mapping:
            url = url_mapping[doc_id]
            logger.info(f"[CITATION] Fixed malformed citation {citation_text} -> {url}")
            return f'<{url}>'
        logger.warning(f"[CITATION] Malformed citation {citation_text} - no URL mapping for ID {doc_id}")
        return citation_text  # Keep original if can't fix

    return malformed_pattern.sub(clean_malformed_citation, text)
|
api/routes.py
CHANGED
|
@@ -356,7 +356,7 @@ async def root():
|
|
| 356 |
<p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
|
| 357 |
<p class="version">API Version 1.0.0</p>
|
| 358 |
|
| 359 |
-
<a href="
|
| 360 |
<i class="fas fa-external-link-alt"></i>
|
| 361 |
Open Frontend
|
| 362 |
</a>
|
|
|
|
| 356 |
<p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
|
| 357 |
<p class="version">API Version 1.0.0</p>
|
| 358 |
|
| 359 |
+
<a href="https://cooking-tutor.vercel.app" class="redirect-btn" target="_blank">
|
| 360 |
<i class="fas fa-external-link-alt"></i>
|
| 361 |
Open Frontend
|
| 362 |
</a>
|