adding new HTML fetching mode

Modules/Web_Fetch.py (CHANGED, +65 -35)
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import re
-from typing import Annotated, Dict, Tuple
+from typing import Annotated, Dict, Literal, Tuple
 from urllib.parse import urlparse, urljoin
 
 import gradio as gr
@@ -16,10 +16,12 @@ from ._docstrings import autodoc
 
 # Single source of truth for the LLM-facing tool description
 TOOL_SUMMARY = (
-    "Fetch a webpage and return clean Markdown or a list of links, with max length and pagination via offset; "
-    "if truncated, the output includes a notice with next_cursor for exact continuation."
+    "Fetch a webpage and return clean Markdown, raw HTML, or a list of links, with max length and pagination via "
+    "offset; if truncated, the output includes a notice with next_cursor for exact continuation."
 )
 
+ModeOption = Literal["markdown", "html", "url_scraper"]
+
 
 def _http_get_enhanced(url: str, timeout: int | float = 30, *, skip_rate_limit: bool = False) -> requests.Response:
     headers = {
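The new `ModeOption` alias gives type checkers a closed set of mode values. A minimal sketch of what that buys; `describe_mode` is hypothetical and not part of the module:

```python
from typing import Literal

ModeOption = Literal["markdown", "html", "url_scraper"]

def describe_mode(mode: ModeOption) -> str:
    # Exhaustive mapping over the three allowed values; mypy/pyright reject a
    # call like describe_mode("xml") at check time, while at runtime the value
    # is still an ordinary str.
    return {
        "markdown": "clean page content",
        "html": "raw response body",
        "url_scraper": "list of links",
    }[mode]

print(describe_mode("html"))  # raw response body
```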
@@ -82,6 +84,21 @@ def _domain_of(url: str) -> str:
     return ""
 
 
+def _normalize_mode(mode: str | None) -> ModeOption:
+    """Convert UI-supplied labels into canonical mode values."""
+    if not mode:
+        return "markdown"
+    normalized = mode.strip().lower()
+    normalized = normalized.replace("-", "_").replace(" ", "_")
+    if normalized in {"markdown", "markdown_mode", "md"}:
+        return "markdown"
+    if normalized in {"html", "html_mode"}:
+        return "html"
+    if normalized in {"url_scraper", "urlscraper", "url_mode", "scraper", "links", "link_mode"}:
+        return "url_scraper"
+    return "markdown"
+
+
 def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
     links = []
     for link in soup.find_all("a", href=True):
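Since `_normalize_mode` is shown in full above, its behavior on the UI labels used later in `build_interface` can be checked directly. A quick sketch, assuming the module is importable as `Modules.Web_Fetch`:

```python
from Modules.Web_Fetch import _normalize_mode

# UI labels are lowercased and underscore-joined before matching.
assert _normalize_mode("Markdown Mode") == "markdown"
assert _normalize_mode("HTML Mode") == "html"
assert _normalize_mode("URL Scraper") == "url_scraper"
# Short aliases and hyphenated variants are accepted too.
assert _normalize_mode("md") == "markdown"
assert _normalize_mode("link-mode") == "url_scraper"
# Missing or unrecognized input falls back to the markdown default.
assert _normalize_mode(None) == "markdown"
assert _normalize_mode("xml") == "markdown"
```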
@@ -134,16 +151,16 @@ def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_
     return markdown_text or "No content could be extracted."
 
 
-def _truncate_markdown(markdown: str, max_chars: int) -> Tuple[str, Dict[str, object]]:
-    total_chars = len(markdown)
+def _truncate_with_notice(content: str, max_chars: int) -> Tuple[str, Dict[str, object]]:
+    total_chars = len(content)
     if total_chars <= max_chars:
-        return markdown, {
+        return content, {
             "truncated": False,
             "returned_chars": total_chars,
             "total_chars_estimate": total_chars,
             "next_cursor": None,
         }
-    truncated = markdown[:max_chars]
+    truncated = content[:max_chars]
     last_paragraph = truncated.rfind("\n\n")
     if last_paragraph > max_chars * 0.7:
         truncated = truncated[:last_paragraph]
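The boundary heuristic in this helper deserves a worked example: after the hard cut at `max_chars`, it backs up to the last blank line, but only when that break sits past 70% of the slice, so it never gives back more than 30% of the budget. A standalone sketch with the relevant step extracted for illustration:

```python
def trim_at_paragraph(content: str, max_chars: int) -> str:
    # Same logic as the hunk above, inlined for demonstration.
    truncated = content[:max_chars]
    last_paragraph = truncated.rfind("\n\n")
    if last_paragraph > max_chars * 0.7:  # break must land past the 70% mark
        truncated = truncated[:last_paragraph]
    return truncated

text = ("A" * 80 + "\n\n") * 3            # paragraph breaks at offsets 80 and 162
print(len(trim_at_paragraph(text, 200)))  # 162: backs up to the break at 162
print(len(trim_at_paragraph(text, 120)))  # 120: break at 80 is too early, keep the hard cut
```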
@@ -180,14 +197,18 @@ def Web_Fetch(
     max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
     offset: Annotated[int, "Character offset to start from (for pagination, use next_cursor from previous call)."] = 0,
     strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
-    url_scraper: Annotated[bool, …] = False,
+    mode: Annotated[
+        str,
+        "Output mode: 'markdown' (default, clean content), 'html' (raw response), or 'url_scraper' (links list).",
+    ] = "markdown",
 ) -> str:
+    canonical_mode = _normalize_mode(mode)
     _log_call_start(
         "Web_Fetch",
         url=url,
         max_chars=max_chars,
         strip_selectors=strip_selectors,
-        url_scraper=url_scraper,
+        mode=canonical_mode,
         offset=offset,
     )
     if not url or not url.strip():
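From a caller's perspective the widened signature stays backward compatible: existing calls keep returning Markdown, and `mode` accepts either the canonical strings or the UI labels thanks to the normalization at entry. Hypothetical calls as a sketch (placeholder URL, needs network access and the module on the import path):

```python
from Modules.Web_Fetch import Web_Fetch

page_md = Web_Fetch(url="https://example.com", max_chars=3000)         # default: markdown
page_html = Web_Fetch(url="https://example.com", mode="html")          # raw response body
page_links = Web_Fetch(url="https://example.com", mode="URL Scraper")  # normalized to "url_scraper"
```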
@@ -210,31 +231,35 @@ def Web_Fetch(
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text
     full_soup = BeautifulSoup(html, "lxml")
-    if url_scraper:
-        …
-        …
-        …
-        if max_chars > 0 and len(result) > max_chars:
-            result, _ = _truncate_markdown(result, max_chars)
-    else:
+    if canonical_mode == "html":
+        _log_call_end("Web_Fetch", f"chars={len(html)}, mode={canonical_mode}, offset=0 (ignored)")
+        return html
+    if canonical_mode == "markdown":
         full_result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+    elif canonical_mode == "url_scraper":
+        full_result = _extract_links_from_soup(full_soup, final_url)
+    else:
+        full_result = html
+
+    if offset > 0:
+        if offset >= len(full_result):
+            result = (
+                f"Offset {offset} exceeds content length ({len(full_result)} characters). "
+                f"Content ends at position {len(full_result)}."
+            )
+            _log_call_end("Web_Fetch", _truncate_for_log(result))
+            return result
+        result = full_result[offset:]
+    else:
+        result = full_result
+
+    if max_chars > 0 and len(result) > max_chars:
+        result, metadata = _truncate_with_notice(result, max_chars)
         if offset > 0:
-            if offset >= len(full_result):
-                result = (
-                    f"Offset {offset} exceeds content length ({len(full_result)} characters). "
-                    f"Content ends at position {len(full_result)}."
-                )
-                _log_call_end("Web_Fetch", _truncate_for_log(result))
-                return result
-            result = full_result[offset:]
-        else:
-            result = full_result
-        if max_chars > 0 and len(result) > max_chars:
-            result, metadata = _truncate_markdown(result, max_chars)
-            if offset > 0:
-                metadata["total_chars_estimate"] = len(full_result)
-                metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
-    _log_call_end("Web_Fetch", f"chars={len(result)}, url_scraper={url_scraper}, offset={offset}")
+            metadata["total_chars_estimate"] = len(full_result)
+            metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
+
+    _log_call_end("Web_Fetch", f"chars={len(result)}, mode={canonical_mode}, offset={offset}")
     return result
 
 
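Note that the raw-HTML branch returns before the offset and max_chars machinery runs, which is why its log line records `offset=0 (ignored)`. For the other two modes, pagination works by re-calling with the `next_cursor` reported in the truncation notice. A sketch of that handshake; the exact notice text is produced elsewhere in the module, so the `2874` here is a hypothetical value read from the first response:

```python
from Modules.Web_Fetch import Web_Fetch

# First call returns up to 3000 chars plus a truncation notice if the page is longer.
first = Web_Fetch(url="https://example.com/long-article", max_chars=3000)
# Suppose the notice in `first` reported next_cursor=2874; continue from there.
rest = Web_Fetch(url="https://example.com/long-article", max_chars=3000, offset=2874)
# Because next_cursor is computed as offset + the in-slice cursor, chaining
# offsets this way continues exactly where the previous slice ended.
```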
@@ -259,13 +284,18 @@ def build_interface() -> gr.Interface:
                 max_lines=1,
                 info="CSS selectors to remove (comma-separated)",
             ),
-            gr.…
+            gr.Radio(
+                label="Mode",
+                choices=["Markdown Mode", "HTML Mode", "URL Scraper"],
+                value="Markdown Mode",
+                info="Markdown cleans content, HTML returns raw response, URL Scraper lists links.",
+            ),
         ],
         outputs=gr.Markdown(label="Extracted Content"),
         title="Web Fetch",
         description=(
-            "<div style=\"text-align:center\">Convert any webpage to …
-            "…
+            "<div style=\"text-align:center\">Convert any webpage to Markdown, inspect the raw HTML response, or "
+            "extract all links. Supports custom element removal, length limits, and pagination with offset.</div>"
         ),
         api_description=TOOL_SUMMARY,
         flagging_mode="never",
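The Radio passes its human-readable label straight through as the `mode` argument; `_normalize_mode` is what collapses "Markdown Mode" to "markdown" on the way in. A minimal sketch of the same wiring in isolation, with a stand-in function instead of `Web_Fetch`:

```python
import gradio as gr

def fake_fetch(url: str, mode: str) -> str:
    # Stand-in for Web_Fetch; shows the label arriving as a plain string.
    return f"would fetch {url!r} with mode={mode!r}"

demo = gr.Interface(
    fn=fake_fetch,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Radio(label="Mode", choices=["Markdown Mode", "HTML Mode", "URL Scraper"], value="Markdown Mode"),
    ],
    outputs=gr.Markdown(label="Extracted Content"),
)
# demo.launch()
```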
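One motivation for the new raw-HTML mode is that a caller can fetch the document once and run arbitrary local extraction instead of re-fetching for each view. A hedged sketch, assuming the module import path and network access:

```python
from bs4 import BeautifulSoup
from Modules.Web_Fetch import Web_Fetch

raw = Web_Fetch(url="https://example.com", mode="html")  # full body; max_chars is bypassed
soup = BeautifulSoup(raw, "lxml")
print(soup.title.string if soup.title else "no <title>")
print(len(soup.find_all("a", href=True)), "links")
```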