Nymbo committed on
Commit ed7ddca · verified · 1 Parent(s): ee1c18d

Update app.py

Files changed (1)
  1. app.py +234 -32
app.py CHANGED
@@ -41,20 +41,138 @@ except Exception: # pragma: no cover - optional dependency
  # ==============================
- # Fetch: HTTP + extraction utils
  # ==============================

- def _http_get(url: str) -> requests.Response:
      """
-     Download the page politely with a short timeout and realistic headers.
-     (Layman's terms: grab the web page like a normal browser would, but quickly.)
      """
      headers = {
-         "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
          "Accept-Language": "en-US,en;q=0.9",
-         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      }
-     return requests.get(url, headers=headers, timeout=15)


  def _normalize_whitespace(text: str) -> str:
@@ -355,7 +473,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
          return "Please enter a valid URL."

      try:
-         resp = _http_get(url)
          resp.raise_for_status()
      except requests.exceptions.RequestException as e:
          return f"An error occurred: {e}"
@@ -377,8 +495,8 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
      if full_page_markdown:
          return _fullpage_markdown_from_soup(full_soup, final_url)

-     # Readable content
-     body_text, readable_soup = _extract_main_text(html)
      if not body_text:
          # Fallback to "whole-page text" if Readability found nothing
          fallback_text = full_soup.get_text(" ", strip=True)
@@ -408,9 +526,37 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
  # ============================================
- # DuckDuckGo Search: ultra-succinct JSONL
  # ============================================

  def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
      max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
@@ -418,10 +564,12 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
      dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
      title_chars: Annotated[int, "Character cap applied to titles."] = 80,
  ) -> str:
      """
-     Run a DuckDuckGo search and return ultra-compact JSONL with short keys to
-     minimize tokens.

      Args:
          query: The search query (supports operators like site:, quotes, OR).
@@ -430,22 +578,50 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
          max_snippet_chars: Character cap applied to each snippet when included.
          dedupe_domains: If true, only the first result from each domain is kept.
          title_chars: Character cap applied to titles.

      Returns:
-         str: Newline-delimited JSON (JSONL). Each line has:
-             {"t": "title", "u": "url"[, "s": "snippet"]}
      """
      if not query or not query.strip():
-         return ""

      try:
          with DDGS() as ddgs:
              raw = ddgs.text(query, max_results=max_results)
      except Exception as e:
-         return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

      seen_domains = set()
-     lines: List[str] = []

      for r in raw or []:
          title = _shorten((r.get("title") or "").strip(), title_chars)
@@ -461,16 +637,41 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
              continue
          seen_domains.add(dom)

-         obj = {"t": title or _domain_of(url), "u": url}
-
          if include_snippets and body:
-             obj["s"] = _shorten(body, max_snippet_chars)
-
-         # Emit most compact JSON possible (no spaces)
-         lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
-
-     # Join as JSONL (each result on its own line)
-     return "\n".join(lines)


  # ======================================
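For reference, each line of the JSONL emitted by the pre-change code in the hunk above follows the shape documented in its docstring; an illustrative, made-up line with a snippet included would be:

{"t":"Example Domain","u":"https://example.com/","s":"This domain is for use in illustrative examples."}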
@@ -762,16 +963,17 @@ concise_interface = gr.Interface(
          gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
          gr.Checkbox(value=True, label="Dedupe by domain"),
          gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
      ],
-     outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
      title="DuckDuckGo Search",
      description=(
-         "<div style=\"text-align:center\">Very concise web search to avoid unnecessary context. Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.</div>"
      ),
      api_description=(
-         "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
-         "t=title, u=url, optional s=snippet. Options control result count, "
-         "snippet inclusion and length, domain deduping, and title length."
      ),
      allow_flagging="never",
      submit_btn="Search",
 
@@ -41,20 +41,138 @@ except Exception: # pragma: no cover - optional dependency
  # ==============================
+ # Fetch: Enhanced HTTP + extraction utils
  # ==============================

+ def _http_get_enhanced(url: str) -> requests.Response:
      """
+     Download the page with enhanced headers, timeout handling, and better error recovery.
      """
      headers = {
+         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
          "Accept-Language": "en-US,en;q=0.9",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+         "Accept-Encoding": "gzip, deflate, br",
+         "DNT": "1",
+         "Connection": "keep-alive",
+         "Upgrade-Insecure-Requests": "1",
      }
+
+     # Apply rate limiting
+     _fetch_rate_limiter.acquire()
+
+     try:
+         response = requests.get(
+             url,
+             headers=headers,
+             timeout=30,  # Increased timeout
+             allow_redirects=True,
+             stream=False
+         )
+         response.raise_for_status()
+         return response
+     except requests.exceptions.Timeout:
+         raise requests.exceptions.RequestException("Request timed out. The webpage took too long to respond.")
+     except requests.exceptions.ConnectionError:
+         raise requests.exceptions.RequestException("Connection error. Please check the URL and your internet connection.")
+     except requests.exceptions.HTTPError as e:
+         if response.status_code == 403:
+             raise requests.exceptions.RequestException("Access forbidden. The website may be blocking automated requests.")
+         elif response.status_code == 404:
+             raise requests.exceptions.RequestException("Page not found. Please check the URL.")
+         elif response.status_code == 429:
+             raise requests.exceptions.RequestException("Rate limited. Please try again in a few minutes.")
+         else:
+             raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {str(e)}")
+
+ def _extract_main_text_enhanced(html: str) -> Tuple[str, BeautifulSoup]:
+     """
+     Enhanced main text extraction with better fallback mechanisms.
+     """
+     try:
+         # Try Readability first
+         doc = Document(html)
+         readable_html = doc.summary(html_partial=True)
+
+         if readable_html and readable_html.strip():
+             soup = BeautifulSoup(readable_html, "lxml")
+
+             # Remove noisy tags more comprehensively
+             for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside", "[role='banner']", "[role='navigation']", "[role='complementary']"]:
+                 for tag in soup.select(sel):
+                     tag.decompose()
+
+             # Extract text with better structure preservation
+             text_parts = []
+             for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "div"]):
+                 chunk = element.get_text(" ", strip=True)
+                 if chunk and len(chunk) > 15:  # Only include substantial content
+                     # Avoid repetitive disclaimers
+                     if not ("responses are generated using ai" in chunk.lower() and len(chunk) < 100):
+                         text_parts.append(chunk)
+
+             if text_parts:
+                 clean_text = _normalize_whitespace("\n\n".join(text_parts))
+                 # Check if we got substantial content
+                 if len(clean_text) > 100:
+                     return clean_text, soup
+
+     except Exception:
+         pass  # Fall through to backup extraction
+
+     # Fallback: Parse original HTML more carefully
+     try:
+         full_soup = BeautifulSoup(html, "lxml")
+
+         # Remove unwanted elements
+         for element in full_soup.select("script, style, nav, footer, header, aside, [role='banner'], [role='navigation'], [role='complementary']"):
+             element.decompose()
+
+         # Try to find main content areas
+         main_content = (
+             full_soup.find("main")
+             or full_soup.find("article")
+             or full_soup.find("div", class_=re.compile(r"content|main|post|article|body", re.I))
+             or full_soup.find("div", id=re.compile(r"content|main|post|article|body", re.I))
+             or full_soup.find("section", class_=re.compile(r"content|main|post|article|body", re.I))
+             or full_soup.find("body")
+             or full_soup
+         )
+
+         if main_content:
+             # More aggressive removal of common noise patterns
+             for element in main_content.select(".disclaimer, .warning, .alert, .notice, [class*='cookie'], [class*='banner'], [id*='cookie'], [id*='banner']"):
+                 element.decompose()
+
+             text = main_content.get_text(" ", strip=True)
+             text = _normalize_whitespace(text)
+
+             # Filter out repetitive text
+             lines = text.split('\n')
+             filtered_lines = []
+             seen_lines = set()
+
+             for line in lines:
+                 line_clean = line.strip()
+                 if len(line_clean) > 10 and line_clean not in seen_lines:
+                     # Skip common disclaimers and repetitive content
+                     if not ("responses are generated using ai" in line_clean.lower() and len(line_clean) < 100):
+                         filtered_lines.append(line)
+                         seen_lines.add(line_clean)
+
+             clean_text = '\n'.join(filtered_lines)
+
+             # Create a minimal soup for link extraction
+             minimal_soup = BeautifulSoup(str(main_content), "lxml")
+             return clean_text, minimal_soup
+
+     except Exception:
+         pass
+
+     # Last resort: Just get all text
+     fallback_soup = BeautifulSoup(html, "lxml")
+     text = fallback_soup.get_text(" ", strip=True)
+     return _normalize_whitespace(text), fallback_soup


  def _normalize_whitespace(text: str) -> str:
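A minimal usage sketch of the two helpers added above (not part of the commit; the URL is illustrative, and _fetch_rate_limiter is the module-level RateLimiter instance introduced later in this diff):

resp = _http_get_enhanced("https://example.com/article")           # rate-limited GET with browser-like headers
body_text, readable_soup = _extract_main_text_enhanced(resp.text)  # Readability first, then heuristic fallbacks
print(body_text[:300])                                             # main-content text with nav/footer/cookie noise stripped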
 
@@ -355,7 +473,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
          return "Please enter a valid URL."

      try:
+         resp = _http_get_enhanced(url)
          resp.raise_for_status()
      except requests.exceptions.RequestException as e:
          return f"An error occurred: {e}"
 
@@ -377,8 +495,8 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
      if full_page_markdown:
          return _fullpage_markdown_from_soup(full_soup, final_url)

+     # Readable content with enhanced extraction
+     body_text, readable_soup = _extract_main_text_enhanced(html)
      if not body_text:
          # Fallback to "whole-page text" if Readability found nothing
          fallback_text = full_soup.get_text(" ", strip=True)
 
@@ -408,9 +526,37 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
  # ============================================
+ # DuckDuckGo Search: Enhanced with error handling & rate limiting
  # ============================================

+ import asyncio
+ from datetime import datetime, timedelta
+
+ class RateLimiter:
+     def __init__(self, requests_per_minute: int = 30):
+         self.requests_per_minute = requests_per_minute
+         self.requests = []
+
+     def acquire(self):
+         """Synchronous rate limiting for non-async context"""
+         now = datetime.now()
+         # Remove requests older than 1 minute
+         self.requests = [
+             req for req in self.requests if now - req < timedelta(minutes=1)
+         ]
+
+         if len(self.requests) >= self.requests_per_minute:
+             # Wait until we can make another request
+             wait_time = 60 - (now - self.requests[0]).total_seconds()
+             if wait_time > 0:
+                 time.sleep(max(1, wait_time))  # At least 1 second wait
+
+         self.requests.append(now)
+
+ # Global rate limiters
+ _search_rate_limiter = RateLimiter(requests_per_minute=20)
+ _fetch_rate_limiter = RateLimiter(requests_per_minute=25)
+
  def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
      max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
 
@@ -418,10 +564,12 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
      dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
      title_chars: Annotated[int, "Character cap applied to titles."] = 80,
+     output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
  ) -> str:
      """
+     Run a DuckDuckGo search with enhanced error handling and multiple output formats.
+     Returns either compact JSONL (t=title, u=url, optional s=snippet) or readable text
+     format optimized for LLM consumption with better error messages.

      Args:
          query: The search query (supports operators like site:, quotes, OR).
 
@@ -430,22 +578,50 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
          max_snippet_chars: Character cap applied to each snippet when included.
          dedupe_domains: If true, only the first result from each domain is kept.
          title_chars: Character cap applied to titles.
+         output_format: Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text.

      Returns:
+         str: Either JSONL format with {"t": "title", "u": "url"[, "s": "snippet"]}
+         or readable text format for better LLM consumption.
      """
      if not query or not query.strip():
+         error_msg = "No search query provided. Please enter a search term."
+         if output_format == "readable":
+             return error_msg
+         return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))

+     # Validate max_results
+     max_results = max(1, min(20, max_results))
+
      try:
+         # Apply rate limiting to avoid being blocked
+         _search_rate_limiter.acquire()
+
+         # Perform search with timeout handling
          with DDGS() as ddgs:
              raw = ddgs.text(query, max_results=max_results)
+
      except Exception as e:
+         error_msg = f"Search failed: {str(e)[:200]}"
+         if "blocked" in str(e).lower() or "rate" in str(e).lower():
+             error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
+         elif "timeout" in str(e).lower():
+             error_msg = "Search timed out. Please try again with a simpler query."
+         elif "network" in str(e).lower() or "connection" in str(e).lower():
+             error_msg = "Network connection error. Please check your internet connection and try again."
+
+         if output_format == "readable":
+             return f"Error: {error_msg}"
+         return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
+
+     if not raw:
+         no_results_msg = f"No results found for query: {query}"
+         if output_format == "readable":
+             return no_results_msg
+         return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))

      seen_domains = set()
+     results = []

      for r in raw or []:
          title = _shorten((r.get("title") or "").strip(), title_chars)
 
@@ -461,16 +637,41 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
              continue
          seen_domains.add(dom)

+         result_obj = {
+             "title": title or _domain_of(url),
+             "url": url
+         }
+
          if include_snippets and body:
+             result_obj["snippet"] = _shorten(body, max_snippet_chars)
+
+         results.append(result_obj)
+
+     if not results:
+         no_results_msg = f"No valid results found for query: {query}"
+         if output_format == "readable":
+             return no_results_msg
+         return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))
+
+     # Format output based on requested format
+     if output_format == "readable":
+         lines = [f"Found {len(results)} search results for: {query}\n"]
+         for i, result in enumerate(results, 1):
+             lines.append(f"{i}. {result['title']}")
+             lines.append(f" URL: {result['url']}")
+             if "snippet" in result:
+                 lines.append(f" Summary: {result['snippet']}")
+             lines.append("")  # Empty line between results
+         return "\n".join(lines)
+     else:
+         # JSONL format with compact keys
+         lines = []
+         for result in results:
+             obj = {"t": result["title"], "u": result["url"]}
+             if "snippet" in result:
+                 obj["s"] = result["snippet"]
+             lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
+         return "\n".join(lines)


  # ======================================
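A small sketch of how the sliding-window RateLimiter added above behaves (illustrative only; a limit of 3 requests per minute is chosen just to make the throttling visible):

limiter = RateLimiter(requests_per_minute=3)
for i in range(5):
    limiter.acquire()   # the 4th and 5th calls sleep until the oldest timestamp is roughly a minute old
    print("request", i, "allowed")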
 
@@ -762,16 +963,17 @@ concise_interface = gr.Interface(
          gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
          gr.Checkbox(value=True, label="Dedupe by domain"),
          gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
+         gr.Radio(label="Output format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact JSON, readable for LLM-friendly text"),
      ],
+     outputs=gr.Textbox(label="Search Results", interactive=False),
      title="DuckDuckGo Search",
      description=(
+         "<div style=\"text-align:center\">Enhanced web search with better error handling and multiple output formats. JSONL format emits compact keys (t,u[,s]), readable format provides LLM-friendly text.</div>"
      ),
      api_description=(
+         "Run a DuckDuckGo search with enhanced error handling and multiple output formats. "
+         "Returns either compact JSONL (t=title, u=url, optional s=snippet) or readable text "
+         "format optimized for LLM consumption with better error messages."
      ),
      allow_flagging="never",
      submit_btn="Search",
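To illustrate the two output formats now selectable through the Output format radio and the output_format parameter, a hedged example of calling the tool function directly (the query and result values here are invented, not real search output):

print(Search_DuckDuckGo("python readability library", max_results=2, output_format="jsonl"))
# {"t":"readability-lxml - PyPI","u":"https://pypi.org/project/readability-lxml/"}
# {"t":"mozilla/readability - GitHub","u":"https://github.com/mozilla/readability"}

print(Search_DuckDuckGo("python readability library", max_results=2, output_format="readable"))
# Found 2 search results for: python readability library
#
# 1. readability-lxml - PyPI
#  URL: https://pypi.org/project/readability-lxml/
#
# 2. mozilla/readability - GitHub
#  URL: https://github.com/mozilla/readability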