Spaces:
Sleeping
Sleeping
Commit
·
232d88f
1
Parent(s):
f00c357
Update file-specific retrieval algorithm
Browse files
app.py
CHANGED
|
@@ -597,7 +597,9 @@ async def chat(
|
|
| 597 |
|
| 598 |
# 0) Detect any filenames mentioned in the question (e.g., JADE.pdf)
|
| 599 |
# Supports .pdf, .docx, and .doc for detection purposes
|
| 600 |
-
mentioned = set([m.group(0) for m in re.finditer(r"[\w\-\. ]+\.(?:pdf|docx|doc)\b", question, re.IGNORECASE)])
|
|
|
|
|
|
|
| 601 |
|
| 602 |
# 0a) If the question explicitly asks for a summary/about of a single mentioned file, return its summary directly
|
| 603 |
if mentioned and (re.search(r"\b(summary|summarize|about|overview)\b", question, re.IGNORECASE)):
|
|
@@ -632,16 +634,28 @@ async def chat(
|
|
| 632 |
key = mfn.lower()
|
| 633 |
if key in filenames_ci_map:
|
| 634 |
mentioned_normalized.append(filenames_ci_map[key])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
|
| 636 |
# 1b) Ask NVIDIA to mark relevance per file
|
| 637 |
relevant_map = await files_relevance(question, files_list, nvidia_rotator)
|
| 638 |
relevant_files = [fn for fn, ok in relevant_map.items() if ok]
|
|
|
|
| 639 |
|
| 640 |
# 1c) Ensure any explicitly mentioned files in the question are included
|
| 641 |
# This safeguards against model misclassification
|
| 642 |
if mentioned_normalized:
|
| 643 |
extra = [fn for fn in mentioned_normalized if fn not in relevant_files]
|
| 644 |
relevant_files.extend(extra)
|
|
|
|
|
|
|
| 645 |
|
| 646 |
# 2) Memory context: recent 3 via NVIDIA, remaining 17 via semantic
|
| 647 |
# recent 3 related (we do a simple include-all; NVIDIA will prune by "related" selection using the same mechanism as files_relevance but here handled in history)
|
|
@@ -681,11 +695,45 @@ async def chat(
|
|
| 681 |
filenames=relevant_files if relevant_files else None
|
| 682 |
)
|
| 683 |
if not hits:
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 689 |
# Compose context
|
| 690 |
contexts = []
|
| 691 |
sources_meta = []
|
|
|
|
| 597 |
|
| 598 |
# 0) Detect any filenames mentioned in the question (e.g., JADE.pdf)
|
| 599 |
# Supports .pdf, .docx, and .doc for detection purposes
|
| 600 |
+
mentioned = set([m.group(0).strip() for m in re.finditer(r"[\w\-\. ]+\.(?:pdf|docx|doc)\b", question, re.IGNORECASE)])
|
| 601 |
+
if mentioned:
|
| 602 |
+
logger.info(f"[CHAT] Detected mentioned filenames in question: {list(mentioned)}")
|
| 603 |
|
| 604 |
# 0a) If the question explicitly asks for a summary/about of a single mentioned file, return its summary directly
|
| 605 |
if mentioned and (re.search(r"\b(summary|summarize|about|overview)\b", question, re.IGNORECASE)):
|
|
|
|
| 634 |
key = mfn.lower()
|
| 635 |
if key in filenames_ci_map:
|
| 636 |
mentioned_normalized.append(filenames_ci_map[key])
|
| 637 |
+
if mentioned and not mentioned_normalized and files_list:
|
| 638 |
+
# Try looser match: compare lowercased filenames with spaces removed (exact match, not substring)
|
| 639 |
+
norm = {f.get("filename", "").lower().replace(" ", ""): f.get("filename") for f in files_list if f.get("filename")}
|
| 640 |
+
for mfn in mentioned:
|
| 641 |
+
key2 = mfn.lower().replace(" ", "")
|
| 642 |
+
if key2 in norm:
|
| 643 |
+
mentioned_normalized.append(norm[key2])
|
| 644 |
+
if mentioned_normalized:
|
| 645 |
+
logger.info(f"[CHAT] Normalized mentions to stored filenames: {mentioned_normalized}")
|
| 646 |
|
| 647 |
# 1b) Ask NVIDIA to mark relevance per file
|
| 648 |
relevant_map = await files_relevance(question, files_list, nvidia_rotator)
|
| 649 |
relevant_files = [fn for fn, ok in relevant_map.items() if ok]
|
| 650 |
+
logger.info(f"[CHAT] NVIDIA relevant files: {relevant_files}")
|
| 651 |
|
| 652 |
# 1c) Ensure any explicitly mentioned files in the question are included
|
| 653 |
# This safeguards against model misclassification
|
| 654 |
if mentioned_normalized:
|
| 655 |
extra = [fn for fn in mentioned_normalized if fn not in relevant_files]
|
| 656 |
relevant_files.extend(extra)
|
| 657 |
+
if extra:
|
| 658 |
+
logger.info(f"[CHAT] Forced-include mentioned files into relevance: {extra}")
|
| 659 |
|
| 660 |
# 2) Memory context: recent 3 via NVIDIA, remaining 17 via semantic
|
| 661 |
# recent 3 related (we do a simple include-all; NVIDIA will prune by "related" selection using the same mechanism as files_relevance but here handled in history)
|
|
|
|
| 695 |
filenames=relevant_files if relevant_files else None
|
| 696 |
)
|
| 697 |
if not hits:
|
| 698 |
+
logger.info(f"[CHAT] No hits with relevance filter. relevant_files={relevant_files}")
|
| 699 |
+
# Retry 1: if we have explicit mentions, try restricting only to them
|
| 700 |
+
if mentioned_normalized:
|
| 701 |
+
hits = rag.vector_search(
|
| 702 |
+
user_id=user_id,
|
| 703 |
+
project_id=project_id,
|
| 704 |
+
query_vector=q_vec,
|
| 705 |
+
k=k,
|
| 706 |
+
filenames=mentioned_normalized
|
| 707 |
+
)
|
| 708 |
+
logger.info(f"[CHAT] Retry with mentioned files only → hits={len(hits) if hits else 0}")
|
| 709 |
+
# Retry 2: if still empty, try without any filename restriction
|
| 710 |
+
if not hits:
|
| 711 |
+
hits = rag.vector_search(
|
| 712 |
+
user_id=user_id,
|
| 713 |
+
project_id=project_id,
|
| 714 |
+
query_vector=q_vec,
|
| 715 |
+
k=k,
|
| 716 |
+
filenames=None
|
| 717 |
+
)
|
| 718 |
+
logger.info(f"[CHAT] Retry with all files → hits={len(hits) if hits else 0}")
|
| 719 |
+
# If still no hits, and we have mentioned files, try returning their summaries if present
|
| 720 |
+
if not hits and mentioned_normalized:
|
| 721 |
+
fsum_map = {f["filename"]: f.get("summary", "") for f in files_list}
|
| 722 |
+
summaries = [fsum_map.get(fn, "") for fn in mentioned_normalized]
|
| 723 |
+
summaries = [s for s in summaries if s]
|
| 724 |
+
if summaries:
|
| 725 |
+
answer = ("\n\n---\n\n").join(summaries)
|
| 726 |
+
return ChatAnswerResponse(
|
| 727 |
+
answer=answer,
|
| 728 |
+
sources=[{"filename": fn, "file_summary": True} for fn in mentioned_normalized],
|
| 729 |
+
relevant_files=mentioned_normalized
|
| 730 |
+
)
|
| 731 |
+
if not hits:
|
| 732 |
+
return ChatAnswerResponse(
|
| 733 |
+
answer="I don't know based on your uploaded materials. Try uploading more sources or rephrasing the question.",
|
| 734 |
+
sources=[],
|
| 735 |
+
relevant_files=relevant_files or mentioned_normalized
|
| 736 |
+
)
|
| 737 |
# Compose context
|
| 738 |
contexts = []
|
| 739 |
sources_meta = []
|