Nymbo committed on
Commit e30c5c2 · verified · 1 Parent(s): 37dcc6f

adding Filterer process for better Deep_Research reports

Files changed (1)
  1. Modules/Deep_Research.py +208 -79
Modules/Deep_Research.py CHANGED
@@ -4,10 +4,10 @@ import os
 import re
 import tempfile
 import time
-from collections import deque
+from collections import OrderedDict, deque
 from concurrent.futures import Future, ThreadPoolExecutor, as_completed
 from datetime import datetime
-from typing import Annotated, Dict, List, Tuple
+from typing import Annotated, Callable, Dict, List, Tuple
 from urllib.parse import urlparse
 
 import gradio as gr
@@ -63,6 +63,14 @@ RESEARCHER_SYSTEM_PROMPT = (
     "</planning_rules>\n\n"
 )
 
+FILTERER_SYSTEM_PROMPT = (
+    "You are Nymbot Filterer, an analyst who selects the most relevant sources for a research task. "
+    "You will be given a summary of the research topic (and optional search queries) followed by multiple fetched documents. "
+    "Each document includes its URL and a truncated excerpt. Evaluate how well each source helps answer the research topic. "
+    "Return only the URLs that should be used for the final research step. Output plain text with exactly one URL per line and no additional commentary, bullets, numbering, or explanations. "
+    "If no sources are relevant, return an empty string."
+)
+
 
 class SlowHost(Exception):
     pass
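The filterer contract above is deliberately plain: one URL per line, nothing else. A minimal sketch of how a caller might sanity-check a reply against that contract before parsing; the reply string and the helper name are hypothetical, not part of this commit:

# Hypothetical reply; a conforming filterer response is just newline-separated URLs.
reply = "https://example.com/a\nhttps://example.org/b"

def looks_like_url_list(text: str) -> bool:
    # Every non-empty line should be a bare http(s) URL with no commentary around it.
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return bool(lines) and all(line.startswith(("http://", "https://")) and " " not in line for line in lines)

print(looks_like_url_list(reply))  # True for a conforming reply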
@@ -161,6 +169,51 @@ def _build_research_prompt(summary: str, queries: List[str], url_list: List[str]
     return "\n\n".join(prompt_parts)
 
 
+def _build_filter_prompt(summary: str, queries: List[str], pages_map: Dict[str, str]) -> str:
+    populated = [q for q in queries if q and q.strip()]
+    summary_text = summary or ""
+    prompt_sections: List[str] = []
+    prompt_sections.append("<research_topic_summary>\n" + summary_text + "\n</research_topic_summary>")
+    if populated:
+        prompt_sections.append("<search_queries>\n" + "\n".join(populated) + "\n</search_queries>")
+    sources: List[str] = []
+    for idx, (url, text) in enumerate(pages_map.items(), start=1):
+        content = text.strip()
+        if not content:
+            continue
+        sources.append(f"[Source {idx}] URL: {url}\n\n{content}")
+    sources_joined, truncated = _truncate_join(sources, max_chars=60_000)
+    prompt_sections.append("<candidate_sources>\n" + sources_joined + ("\n\n[NOTE] Sources truncated due to context limits." if truncated else "") + "\n</candidate_sources>")
+    prompt_sections.append(
+        "<task>\nIdentify which of the provided URLs should be retained for the final research synthesis. "
+        "Consider coverage, credibility, and relevance to the research topic. "
+        "Return ONLY the URLs you choose, with one URL per line and no additional text.\n</task>"
+    )
+    return "\n\n".join(prompt_sections)
+
+
+def _parse_filterer_output(raw: str, allowed_urls: List[str]) -> List[str]:
+    if not raw:
+        return []
+    allowed_set = {url.strip(): idx for idx, url in enumerate(allowed_urls)}
+    found_indices: set[int] = set()
+    for line in raw.splitlines():
+        candidate = line.strip()
+        if not candidate:
+            continue
+        if candidate in allowed_set:
+            found_indices.add(allowed_set[candidate])
+            continue
+        match = re.search(r"https?://[^\s]+", candidate)
+        if not match:
+            continue
+        url = match.group(0).rstrip(".,);]")
+        if url in allowed_set:
+            found_indices.add(allowed_set[url])
+    selected = [allowed_urls[idx] for idx in sorted(found_indices)]
+    return selected
+
+
 def _write_report_tmp(text: str) -> str:
     tmp_dir = tempfile.mkdtemp(prefix="deep_research_")
     path = os.path.join(tmp_dir, "research_report.txt")
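A usage sketch for _parse_filterer_output with made-up URLs and a deliberately messy model reply. The import assumes the module and its dependencies are installed; note that selected URLs come back in the order of allowed_urls, not the order the model listed them:

# Illustrative only: the URLs and the reply text are invented.
from Modules.Deep_Research import _parse_filterer_output

allowed = ["https://site-a.test/page", "https://site-b.test/doc", "https://site-c.test/post"]
raw_reply = "1. https://site-b.test/doc\nSome stray commentary the parser ignores\nhttps://site-a.test/page"

print(_parse_filterer_output(raw_reply, allowed))
# ['https://site-a.test/page', 'https://site-b.test/doc']  (allowed-list order, numbering and prose stripped)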
@@ -169,6 +222,76 @@ def _write_report_tmp(text: str) -> str:
     return path
 
 
+def _fetch_pages_within_budget(urls: List[str], char_limit: int, time_left_fn: Callable[[], float]) -> OrderedDict:
+    pages: dict[str, str] = {}
+    if not urls:
+        return OrderedDict()
+    queue = deque(urls)
+    attempts: dict[str, int] = {url: 0 for url in urls}
+    max_attempts = 2
+    max_workers = min(12, max(4, len(urls)))
+    in_flight: dict[Future, str] = {}
+    delayed: list[tuple[float, str]] = []
+
+    def schedule_next(executor: ThreadPoolExecutor) -> None:
+        while queue and len(in_flight) < max_workers:
+            url = queue.popleft()
+            if url in pages:
+                continue
+            attempts.setdefault(url, 0)
+            if attempts[url] >= max_attempts:
+                continue
+            attempts[url] += 1
+            tl = time_left_fn()
+            if tl <= 0.1:
+                return
+            per_timeout = 10.0 if tl > 15 else (5.0 if tl > 8 else 2.0)
+            future = executor.submit(_fetch_page_markdown_fast, url, char_limit, per_timeout)
+            in_flight[future] = url
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        schedule_next(executor)
+        while (in_flight or queue or delayed) and time_left_fn() > 0.2:
+            now = time.time()
+            if delayed:
+                ready: list[tuple[float, str]] = []
+                not_ready: list[tuple[float, str]] = []
+                for ready_time, delayed_url in delayed:
+                    (ready if ready_time <= now else not_ready).append((ready_time, delayed_url))
+                delayed = not_ready
+                for _, delayed_url in ready:
+                    queue.append(delayed_url)
+                if ready:
+                    schedule_next(executor)
+            done = [future for future in list(in_flight.keys()) if future.done()]
+            if not done:
+                if not queue and delayed:
+                    next_ready = min((t for t, _ in delayed), default=time.time())
+                    sleep_for = max(0.0, next_ready - time.time())
+                    time.sleep(max(0.02, min(0.25, sleep_for)))
+                else:
+                    time.sleep(0.05)
+                continue
+            for future in done:
+                url = in_flight.pop(future)
+                try:
+                    md = future.result()
+                    if md and not md.startswith("Unsupported content type") and not md.startswith("An error occurred"):
+                        pages[url] = md
+                        try:
+                            print(f"[FETCH OK] {url} (chars={len(md)})", flush=True)
+                        except Exception:
+                            pass
+                except SlowHost:
+                    if time_left_fn() > 5.0:
+                        delayed.append((time.time() + 3.0, url))
+                except Exception:
+                    pass
+            schedule_next(executor)
+    ordered = OrderedDict((url, pages[url]) for url in urls if url in pages)
+    return ordered
+
+
 @autodoc(
     summary=TOOL_SUMMARY,
 )
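A sketch of how _fetch_pages_within_budget is meant to be driven: the caller passes a closure over a wall-clock deadline, mirroring the time_left() helper inside Deep_Research. The URLs and the 30-second budget are placeholders, and the import assumes the module's dependencies are available:

import time
from Modules.Deep_Research import _fetch_pages_within_budget

deadline = time.time() + 30.0  # overall budget for this fetch pass

def time_left() -> float:
    return max(0.0, deadline - time.time())

urls = ["https://example.com/article", "https://example.org/report"]
pages = _fetch_pages_within_budget(urls, char_limit=3000, time_left_fn=time_left)
print(list(pages.keys()))  # OrderedDict keyed by URL, in the original input order of successful fetches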
@@ -217,6 +340,11 @@ def Deep_Research(
     def time_left() -> float:
         return max(0.0, deadline - time.time())
 
+    now_dt = datetime.now().astimezone()
+    date_str = now_dt.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
+    if not date_str:
+        date_str = now_dt.isoformat()
+
     all_urls: list[str] = []
     tasks = []
     with ThreadPoolExecutor(max_workers=min(5, sum(1 for q in queries if q.strip())) or 1) as executor:
@@ -279,71 +407,79 @@ def Deep_Research(
         return any(path.endswith(ext) for ext in skip_exts)
 
     all_urls = [url for url in all_urls if not _skip_url(url)]
-    pages: dict[str, str] = {}
-    if all_urls:
-        queue = deque(all_urls)
-        attempts: dict[str, int] = {url: 0 for url in all_urls}
-        max_attempts = 2
-        max_workers = min(12, max(4, len(all_urls)))
-        in_flight: dict[Future, str] = {}
-        delayed: list[tuple[float, str]] = []
-
-        def schedule_next(executor: ThreadPoolExecutor) -> None:
-            while queue and len(in_flight) < max_workers:
-                url = queue.popleft()
-                if url in pages:
-                    continue
-                if attempts[url] >= max_attempts:
-                    continue
-                attempts[url] += 1
-                tl = time_left()
-                per_timeout = 10.0 if tl > 15 else (5.0 if tl > 8 else 2.0)
-                future = executor.submit(_fetch_page_markdown_fast, url, 3000, per_timeout)
-                in_flight[future] = url
-
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            schedule_next(executor)
-            while (in_flight or queue) and time_left() > 0.2:
-                now = time.time()
-                if delayed:
-                    ready = []
-                    not_ready = []
-                    for ready_time, url in delayed:
-                        (ready if ready_time <= now else not_ready).append((ready_time, url))
-                    delayed = not_ready
-                    for _, url in ready:
-                        queue.append(url)
-                    if ready:
-                        schedule_next(executor)
-                done = [future for future in list(in_flight.keys()) if future.done()]
-                if not done:
-                    if not queue and delayed:
-                        sleep_for = max(0.02, min(0.25, max(0.0, min(t for t, _ in delayed) - time.time())))
-                        time.sleep(sleep_for)
-                    else:
-                        time.sleep(0.05)
-                else:
-                    for future in done:
-                        url = in_flight.pop(future)
-                        try:
-                            md = future.result()
-                            if md and not md.startswith("Unsupported content type") and not md.startswith("An error occurred"):
-                                pages[url] = md
-                                try:
-                                    print(f"[FETCH OK] {url} (chars={len(md)})", flush=True)
-                                except Exception:
-                                    pass
-                        except SlowHost:
-                            if time_left() > 5.0:
-                                delayed.append((time.time() + 3.0, url))
-                        except Exception:
-                            pass
-                    schedule_next(executor)
+    truncated_pages = OrderedDict()
+    if all_urls and time_left() > 0.2:
+        truncated_pages = _fetch_pages_within_budget(all_urls, 3000, time_left)
+    print(
+        f"[PIPELINE] Initial fetch complete: candidates={len(all_urls)}, truncated_documents={len(truncated_pages)}, time_left={time_left():.2f}s",
+        flush=True,
+    )
+
+    def _invoke_chat(messages, provider: str, max_tokens: int, temp: float, top_p: float):
+        client = InferenceClient(provider=provider, api_key=HF_TEXTGEN_TOKEN)
+        return client.chat.completions.create(
+            model="Qwen/Qwen3-235B-A22B-Thinking-2507",
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temp,
+            top_p=top_p,
+        )
+
+    filtered_urls: List[str] = list(truncated_pages.keys())
+    filter_output = ""
+    filter_used_fallback = False
+    filter_success = False
+    if truncated_pages and time_left() > 3.0:
+        filter_prompt = _build_filter_prompt(summary or "", [q for q in queries if q.strip()], truncated_pages)
+        filter_messages = [
+            {"role": "system", "content": FILTERER_SYSTEM_PROMPT},
+            {"role": "user", "content": f"The current date is {date_str}. Consider how recent each source is when deciding relevance."},
+            {"role": "user", "content": filter_prompt},
+        ]
+        filter_completion = None
+        try:
+            print("[FILTER] Attempt 1: provider=cerebras, max_tokens=2048", flush=True)
+            filter_completion = _invoke_chat(filter_messages, "cerebras", 2048, 0.2, 0.9)
+        except Exception as exc1:
+            print(f"[FILTER] Attempt 1 failed: {str(exc1)[:200]}", flush=True)
+            try:
+                print("[FILTER] Attempt 2: provider=auto, max_tokens=2048", flush=True)
+                filter_completion = _invoke_chat(filter_messages, "auto", 2048, 0.2, 0.9)
+            except Exception as exc2:
+                print(f"[FILTER] Attempt 2 failed: {str(exc2)[:200]}", flush=True)
+        if filter_completion and filter_completion.choices:
+            filter_output = filter_completion.choices[0].message.content or ""
+            filtered_urls = _parse_filterer_output(filter_output, list(truncated_pages.keys()))
+            filter_success = bool(filter_output.strip()) and bool(filtered_urls)
+    if not filtered_urls:
+        filter_used_fallback = True
+        fallback_count = min(8, len(truncated_pages))
+        filtered_urls = list(truncated_pages.keys())[:fallback_count]
+    max_final_urls = 20
+    if len(filtered_urls) > max_final_urls:
+        filter_used_fallback = True
+        filtered_urls = filtered_urls[:max_final_urls]
+    if not filter_success:
+        filter_used_fallback = True
+    print(
+        f"[FILTER] Selected URLs={len(filtered_urls)}, fallback={filter_used_fallback}, time_left={time_left():.2f}s",
+        flush=True,
+    )
+
+    final_pages_fetched = OrderedDict()
+    if filtered_urls and time_left() > 0.2:
+        final_pages_fetched = _fetch_pages_within_budget(filtered_urls, 8000, time_left)
+    merged_pages = OrderedDict()
+    for url in filtered_urls:
+        content = final_pages_fetched.get(url) or truncated_pages.get(url) or ""
+        if content:
+            merged_pages[url] = content
+    pages = merged_pages
+    print(
+        f"[PIPELINE] Final fetch complete: retained_documents={len(pages)}, time_left={time_left():.2f}s",
+        flush=True,
+    )
     prompt = _build_research_prompt(summary=summary or "", queries=[q for q in queries if q.strip()], url_list=list(pages.keys()), pages_map=pages)
-    now = datetime.now().astimezone()
-    date_str = now.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
-    if not date_str:
-        date_str = now.isoformat()
     system_message = {"role": "system", "content": RESEARCHER_SYSTEM_PROMPT}
     date_message = {"role": "user", "content": f"The current date is {date_str}. Return only the research report."}
     messages = [
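The hunk above is the heart of the change: a cheap truncated fetch feeds the filterer, and only the URLs it keeps (capped at 20, with a top-8 fallback when filtering fails) are re-fetched at full depth. A condensed restatement of that flow, using the helper names from this diff; run_filter_model is a hypothetical callable standing in for the _invoke_chat retry logic, and the helpers are assumed to be in scope:

def select_and_fetch(all_urls, summary, queries, time_left, run_filter_model):
    # Stage 1: shallow fetch (3000 chars per page) so the filterer sees every candidate cheaply.
    truncated = _fetch_pages_within_budget(all_urls, 3000, time_left)
    # Stage 2: ask the filterer which URLs to keep; fall back to the first 8 if it returns nothing usable.
    raw = run_filter_model(_build_filter_prompt(summary, queries, truncated))
    keep = _parse_filterer_output(raw, list(truncated.keys())) or list(truncated.keys())[:8]
    keep = keep[:20]  # hard cap on sources passed to the researcher
    # Stage 3: deep fetch (8000 chars) of the survivors, reusing the shallow text if a re-fetch fails.
    deep = _fetch_pages_within_budget(keep, 8000, time_left)
    return {url: deep.get(url) or truncated.get(url) for url in keep if deep.get(url) or truncated.get(url)}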
@@ -358,19 +494,9 @@ def Deep_Research(
     print(f"[PIPELINE] Fetch complete: pages={len(pages)}, unique_urls={len(pages.keys())}, prompt_chars={prompt_chars}", flush=True)
     print("[PIPELINE] Starting inference (provider=cerebras, model=Qwen/Qwen3-235B-A22B-Thinking-2507)", flush=True)
 
-    def _run_inference(provider: str, max_tokens: int, temp: float, top_p: float):
-        client = InferenceClient(provider=provider, api_key=HF_TEXTGEN_TOKEN)
-        return client.chat.completions.create(
-            model="Qwen/Qwen3-235B-A22B-Thinking-2507",
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temp,
-            top_p=top_p,
-        )
-
     try:
         print("[LLM] Attempt 1: provider=cerebras, max_tokens=32768", flush=True)
-        completion = _run_inference("cerebras", max_tokens=32768, temp=0.3, top_p=0.95)
+        completion = _invoke_chat(messages, "cerebras", max_tokens=32768, temp=0.3, top_p=0.95)
     except Exception as exc1:
         print(f"[LLM] Attempt 1 failed: {str(exc1)[:200]}", flush=True)
         try:
@@ -386,12 +512,12 @@ def Deep_Research(
                 {"role": "user", "content": prompt2},
             ]
             print("[LLM] Attempt 2: provider=cerebras (trimmed), max_tokens=16384", flush=True)
-            completion = _run_inference("cerebras", max_tokens=16384, temp=0.7, top_p=0.95)
+            completion = _invoke_chat(messages, "cerebras", max_tokens=16384, temp=0.7, top_p=0.95)
         except Exception as exc2:
             print(f"[LLM] Attempt 2 failed: {str(exc2)[:200]}", flush=True)
             try:
                 print("[LLM] Attempt 3: provider=auto, max_tokens=8192", flush=True)
-                completion = _run_inference("auto", max_tokens=8192, temp=0.7, top_p=0.95)
+                completion = _invoke_chat(messages, "auto", max_tokens=8192, temp=0.7, top_p=0.95)
             except Exception as exc3:
                 _log_call_end("Deep_Research", f"error={_truncate_for_log(str(exc3), 260)}")
                 raise gr.Error(f"Researcher model call failed: {exc3}")
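The researcher call now shares the filterer's _invoke_chat helper and degrades through a ladder of attempts: cerebras at full budget, cerebras again with a trimmed prompt and half the tokens, then provider=auto. A generic sketch of that pattern, not the repository's code (it omits the prompt trimming the real attempt 2 performs before retrying):

def call_with_fallbacks(invoke, messages):
    # Shrink max_tokens and loosen the provider on each failure, mirroring the ladder in the diff.
    attempts = [("cerebras", 32768, 0.3), ("cerebras", 16384, 0.7), ("auto", 8192, 0.7)]
    last_exc = None
    for provider, max_tokens, temp in attempts:
        try:
            return invoke(messages, provider, max_tokens=max_tokens, temp=temp, top_p=0.95)
        except Exception as exc:  # keep trying cheaper configurations
            last_exc = exc
    raise RuntimeError(f"all attempts failed: {last_exc}")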
@@ -423,6 +549,9 @@ def Deep_Research(
     except Exception:
         pass
     links_text = "\n".join([f"[{i+1}] {url}" for i, url in enumerate(pages.keys())])
+    if links_text:
+        sources_section = "\n\n## Sources\n" + "\n".join([f"[{i+1}] {url}" for i, url in enumerate(pages.keys())])
+        report = report.rstrip() + sources_section
     file_path = _write_report_tmp(report)
     elapsed = time.time() - start_ts
     print(f"[TIMING] Deep_Research elapsed: {elapsed:.2f}s", flush=True)
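The final hunk appends a numbered Sources section to the report body before it is written to disk. With two hypothetical URLs, the appended text looks like this:

pages = {"https://example.com/a": "...", "https://example.org/b": "..."}
sources_section = "\n\n## Sources\n" + "\n".join(f"[{i+1}] {url}" for i, url in enumerate(pages))
print(sources_section)
# ## Sources
# [1] https://example.com/a
# [2] https://example.org/b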
 