LiamKhoaLe committed on
Commit
07c35d1
·
1 Parent(s): 43c0263

Upd citation parser

Browse files
Files changed (2) hide show
  1. api/chatbot.py +101 -20
  2. api/routes.py +1 -1
api/chatbot.py CHANGED
@@ -429,10 +429,47 @@ class CookingTutorChatbot:
429
  return structured_blocks
430
 
431
  def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
432
- """Replace citation tags with actual URLs, handling both single and multiple references"""
433
 
434
- # Pattern to match both single citations <#1> and multiple citations <#1, #2, #5, #7, #9>
435
- citation_pattern = r'<#([^>]+)>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
 
437
  def replace_citation(match):
438
  citation_content = match.group(1)
@@ -441,29 +478,73 @@ class CookingTutorChatbot:
441
 
442
  urls = []
443
  for citation_id in citation_ids:
444
- try:
445
- doc_id = int(citation_id)
446
- if doc_id in url_mapping:
447
- url = url_mapping[doc_id]
448
- urls.append(f'<{url}>')
449
- logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
 
 
 
 
450
  else:
451
  logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
452
- urls.append(f'<#{doc_id}>') # Keep original if URL not found
453
- except ValueError:
454
- logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
455
- urls.append(f'<#{citation_id}>') # Keep original if invalid
456
 
457
  # Join multiple URLs with spaces
458
  return ' '.join(urls)
459
 
460
- # Replace citations with URLs
461
- processed_response = re.sub(citation_pattern, replace_citation, response)
 
462
 
463
- # Count total citations processed
464
- citations_found = re.findall(citation_pattern, response)
465
- total_citations = sum(len([id_str.strip() for id_str in citation_content.split(',')])
466
- for citation_content in citations_found)
 
 
 
 
 
467
 
468
- logger.info(f"[CITATION] Processed {total_citations} citations from {len(citations_found)} citation groups, {len(url_mapping)} URL mappings available")
 
 
 
469
  return processed_response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  return structured_blocks
430
 
431
  def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
432
+ """Replace citation tags with actual URLs, handling various citation formats flexibly"""
433
 
434
+ # More flexible pattern to match various citation formats
435
+ citation_patterns = [
436
+ r'<#([^>]+)>', # Standard format: <#1>, <#1,2,3>
437
+ r'<#ID\s*(\d+)>', # Format: <#ID 1>, <#ID 3>
438
+ r'<#\s*ID\s*(\d+)>', # Format: <# ID 1>
439
+ r'<#(\d+)>', # Simple format: <#1>
440
+ r'<#\s*(\d+)\s*>', # Format with spaces: <# 1 >
441
+ ]
442
+
443
def extract_numeric_id(citation_id: str) -> "int | None":
    """Return the first integer that appears in a single citation token.

    Accepts tokens such as ``"1"``, ``" 3 "``, ``"ID 4"``, ``"#ID 7"`` or
    ``"id5"``. Any prefix or decoration is ignored; only the first
    contiguous run of digits is used.

    Args:
        citation_id: One citation token (already split from its group).

    Returns:
        The document ID as an ``int``, or ``None`` when the token is empty
        or contains no digits.
    """
    if not citation_id:
        return None

    # Search the raw token for the first digit run. Stripping non-digit
    # characters first (as an earlier version did) merges separate numbers,
    # e.g. "1-3" became 13 and "1.5" became 15, resolving to the wrong
    # document. Taking the first run also matches how
    # _handle_malformed_citations picks its ID.
    first_number = re.search(r'\d+', citation_id)
    if first_number is None:
        return None
    return int(first_number.group())
473
 
474
  def replace_citation(match):
475
  citation_content = match.group(1)
 
478
 
479
  urls = []
480
  for citation_id in citation_ids:
481
+ # Extract numeric ID from various formats
482
+ doc_id = extract_numeric_id(citation_id)
483
+
484
+ if doc_id is not None and doc_id in url_mapping:
485
+ url = url_mapping[doc_id]
486
+ urls.append(f'<{url}>')
487
+ logger.info(f"[CITATION] Replacing <#{citation_id}> with {url}")
488
+ else:
489
+ if doc_id is None:
490
+ logger.warning(f"[CITATION] Could not extract numeric ID from: {citation_id}")
491
  else:
492
  logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
493
+ urls.append(f'<#{citation_id}>') # Keep original if URL not found
 
 
 
494
 
495
  # Join multiple URLs with spaces
496
  return ' '.join(urls)
497
 
498
+ # Process with each pattern
499
+ processed_response = response
500
+ total_citations_processed = 0
501
 
502
+ for pattern in citation_patterns:
503
+ # Count citations before processing
504
+ citations_found = re.findall(pattern, processed_response)
505
+ if citations_found:
506
+ # Process citations with this pattern
507
+ processed_response = re.sub(pattern, replace_citation, processed_response)
508
+ total_citations_processed += sum(len([id_str.strip() for id_str in citation_content.split(',')])
509
+ for citation_content in citations_found)
510
+ logger.info(f"[CITATION] Processed {len(citations_found)} citation groups with pattern: {pattern}")
511
 
512
+ # Fallback: Handle any remaining malformed citations
513
+ processed_response = self._handle_malformed_citations(processed_response, url_mapping)
514
+
515
+ logger.info(f"[CITATION] Total citations processed: {total_citations_processed}, URL mappings available: {len(url_mapping)}")
516
  return processed_response
517
+
518
+ def _handle_malformed_citations(self, text: str, url_mapping: Dict[int, str]) -> str:
519
+ """Handle any remaining malformed citations that didn't match our patterns"""
520
+ import re
521
+
522
+ # Look for any remaining citation-like patterns
523
+ malformed_patterns = [
524
+ r'<#\s*ID\s*\d+\s*>', # <# ID 1 >
525
+ r'<#\s*ID\s*\d+>', # <# ID 1>
526
+ r'<#ID\s*\d+\s*>', # <#ID 1 >
527
+ r'<#\s*\d+\s*ID\s*>', # <# 1 ID >
528
+ r'<#\s*\d+\s*ID>', # <# 1 ID>
529
+ ]
530
+
531
+ def clean_malformed_citation(match):
532
+ citation_text = match.group(0)
533
+ # Extract any number from the citation
534
+ numbers = re.findall(r'\d+', citation_text)
535
+ if numbers:
536
+ doc_id = int(numbers[0])
537
+ if doc_id in url_mapping:
538
+ url = url_mapping[doc_id]
539
+ logger.info(f"[CITATION] Fixed malformed citation {citation_text} -> {url}")
540
+ return f'<{url}>'
541
+ else:
542
+ logger.warning(f"[CITATION] Malformed citation {citation_text} - no URL mapping for ID {doc_id}")
543
+ else:
544
+ logger.warning(f"[CITATION] Malformed citation {citation_text} - no number found")
545
+ return citation_text # Keep original if can't fix
546
+
547
+ for pattern in malformed_patterns:
548
+ text = re.sub(pattern, clean_malformed_citation, text)
549
+
550
+ return text
api/routes.py CHANGED
@@ -356,7 +356,7 @@ async def root():
356
  <p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
357
  <p class="version">API Version 1.0.0</p>
358
 
359
- <a href="/" class="redirect-btn" target="_blank">
360
  <i class="fas fa-external-link-alt"></i>
361
  Open Frontend
362
  </a>
 
356
  <p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
357
  <p class="version">API Version 1.0.0</p>
358
 
359
+ <a href="https://cooking-tutor.vercel.app" class="redirect-btn" target="_blank">
360
  <i class="fas fa-external-link-alt"></i>
361
  Open Frontend
362
  </a>