Update app.py
app.py (CHANGED)
@@ -13,23 +13,19 @@ import json
 import sys
 import os
 import random
-import asyncio
-import time
 from io import StringIO
 from typing import List, Dict, Tuple, Annotated

 import gradio as gr
 import requests
-import httpx
 from bs4 import BeautifulSoup
-from markdownify import markdownify as md
-from readability import Document
+from markdownify import markdownify as md
+from readability import Document
 from urllib.parse import urljoin, urldefrag, urlparse
 from ddgs import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
+import time

 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
-from datetime import datetime, timedelta
-from dataclasses import dataclass
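Context for the import shuffle above: `Document` (readability-lxml) isolates the main article markup, and markdownify's `md` converts that cleaned HTML into Markdown. A minimal sketch of how the two compose, with an illustrative URL; this is not the app's own extraction code:

```python
# Illustrative readability + markdownify pipeline (not the app's code).
import requests
from markdownify import markdownify as md
from readability import Document

html = requests.get("https://example.com", timeout=15).text
doc = Document(html)
readable_html = doc.summary(html_partial=True)  # main-content HTML only
print(doc.short_title())
print(md(readable_html)[:200])                  # cleaned HTML -> Markdown
```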
@@ -44,94 +40,6 @@ except Exception: # pragma: no cover - optional dependency
     KPipeline = None  # type: ignore


-# ==============================
-# Rate Limiting and HTTP Utils
-# ==============================
-
-@dataclass
-class SearchResult:
-    title: str
-    link: str
-    snippet: str
-    position: int
-
-
-class RateLimiter:
-    """Rate limiter to prevent being blocked by services"""
-    def __init__(self, requests_per_minute: int = 30):
-        self.requests_per_minute = requests_per_minute
-        self.requests = []
-
-    async def acquire(self):
-        now = datetime.now()
-        # Remove requests older than 1 minute
-        self.requests = [
-            req for req in self.requests if now - req < timedelta(minutes=1)
-        ]
-
-        if len(self.requests) >= self.requests_per_minute:
-            # Wait until we can make another request
-            wait_time = 60 - (now - self.requests[0]).total_seconds()
-            if wait_time > 0:
-                await asyncio.sleep(wait_time)
-
-        self.requests.append(now)
-
-
-class ImprovedWebFetcher:
-    """Improved web fetcher with rate limiting and async support"""
-
-    def __init__(self):
-        self.rate_limiter = RateLimiter(requests_per_minute=20)
-        self.headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-            "Accept-Language": "en-US,en;q=0.9",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-        }
-
-    async def fetch_async(self, url: str) -> Tuple[str, str, str]:
-        """
-        Fetch URL with rate limiting and proper error handling
-        Returns: (html_content, final_url, error_message)
-        """
-        await self.rate_limiter.acquire()
-
-        try:
-            async with httpx.AsyncClient() as client:
-                response = await client.get(
-                    url,
-                    headers=self.headers,
-                    follow_redirects=True,
-                    timeout=30.0,
-                )
-                response.raise_for_status()
-                return response.text, str(response.url), ""
-
-        except httpx.TimeoutException:
-            return "", "", f"Request timed out for URL: {url}"
-        except httpx.HTTPError as e:
-            return "", "", f"HTTP error occurred: {str(e)}"
-        except Exception as e:
-            return "", "", f"Unexpected error: {str(e)}"
-
-# Global instances
-_web_fetcher = ImprovedWebFetcher()
-_search_rate_limiter = RateLimiter(requests_per_minute=30)
-
-# Simple sync rate limiting for backwards compatibility
-_last_request_time = 0
-_min_request_interval = 2  # seconds between requests
-
-def _apply_rate_limit():
-    """Simple synchronous rate limiting"""
-    global _last_request_time
-    current_time = time.time()
-    elapsed = current_time - _last_request_time
-    if elapsed < _min_request_interval:
-        time.sleep(_min_request_interval - elapsed)
-    _last_request_time = time.time()
-
-
 # ==============================
 # Fetch: HTTP + extraction utils
 # ==============================
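The block removed above implemented a sliding-window limiter: keep timestamps of recent requests, discard any older than sixty seconds, and sleep until the oldest falls out of the window whenever the cap is hit. For reference, this is how a caller would have driven it; the function below is a hypothetical sketch, not code from the app:

```python
# Hypothetical driver for the removed RateLimiter/ImprovedWebFetcher pair.
import asyncio

async def fetch_many(urls, fetcher, limiter):
    results = []
    for url in urls:
        await limiter.acquire()   # sleeps only while the 60 s window is full
        results.append(await fetcher.fetch_async(url))
    return results
```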
@@ -139,18 +47,14 @@ def _apply_rate_limit():
 def _http_get(url: str) -> requests.Response:
     """
     Download the page politely with a short timeout and realistic headers.
-    Enhanced with better error handling, headers from ddg-search patterns, and rate limiting.
     (Layman's terms: grab the web page like a normal browser would, but quickly.)
     """
-    # Apply rate limiting to avoid being blocked
-    _apply_rate_limit()
-
     headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
         "Accept-Language": "en-US,en;q=0.9",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     }
-    return requests.get(url, headers=headers, timeout=…
+    return requests.get(url, headers=headers, timeout=15)


 def _normalize_whitespace(text: str) -> str:
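A usage note on the helper above: `requests.get` raises on its own for transport failures, `raise_for_status()` additionally converts HTTP 4xx/5xx responses into exceptions, and a single float `timeout` bounds the connect and read phases separately. A sketch, assuming the module context:

```python
# Sketch: calling the helper (assumes _http_get from this module is in scope).
resp = _http_get("https://example.com")  # illustrative URL
resp.raise_for_status()                  # HTTP 4xx/5xx -> requests.HTTPError
print(resp.status_code, resp.headers.get("Content-Type"))
```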
@@ -256,8 +160,8 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     # Parse simplified HTML
     s = BeautifulSoup(readable_html, "lxml")

-    # Remove noisy tags
-    for sel in ["script", "style", "noscript", "iframe", "svg"…
+    # Remove noisy tags
+    for sel in ["script", "style", "noscript", "iframe", "svg"]:
         for tag in s.select(sel):
             tag.decompose()

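The loop fixed above strips boilerplate elements before text extraction; `decompose()` removes a tag and everything under it from the parse tree. A standalone illustration:

```python
# Standalone illustration of the noisy-tag removal.
from bs4 import BeautifulSoup

s = BeautifulSoup("<div><script>x()</script><p>kept</p></div>", "lxml")
for sel in ["script", "style", "noscript", "iframe", "svg"]:
    for tag in s.select(sel):
        tag.decompose()           # drop the tag and its subtree
print(s.get_text(strip=True))     # -> kept
```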
@@ -453,30 +357,17 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     try:
         resp = _http_get(url)
         resp.raise_for_status()
-    except requests.exceptions.Timeout:
-        return f"Error: Request timed out while fetching {url}. Please try again or check if the website is accessible."
-    except requests.exceptions.ConnectionError:
-        return f"Error: Could not connect to {url}. Please check the URL and your internet connection."
-    except requests.exceptions.HTTPError as e:
-        return f"Error: HTTP {e.response.status_code} - {e.response.reason} when accessing {url}"
     except requests.exceptions.RequestException as e:
-        return f"…
-    except Exception as e:
-        return f"Error: An unexpected error occurred while fetching the webpage ({str(e)})"
+        return f"An error occurred: {e}"

     final_url = str(resp.url)
     ctype = resp.headers.get("Content-Type", "")
     if "html" not in ctype.lower():
-        return f"Unsupported content type for extraction: {ctype or 'unknown'}…
+        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

-    # Decode to text
-
-
-    try:
-        html = resp.text
-    except UnicodeDecodeError:
-        # Fallback encoding handling
-        html = resp.content.decode('utf-8', errors='replace')
+    # Decode to text
+    resp.encoding = resp.encoding or resp.apparent_encoding
+    html = resp.text

     # Full-page soup for metadata (and potential Markdown conversion)
     full_soup = BeautifulSoup(html, "lxml")
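On the new decode step: `resp.text` decodes with the charset declared in the response headers when one is present; when nothing is declared, `apparent_encoding` guesses from the raw bytes (via charset_normalizer/chardet). The added line pins that guess onto the response before `.text` is read:

```python
# The decode idiom from this commit, in isolation (illustrative URL).
import requests

resp = requests.get("https://example.com")
# Keep the header-declared charset if present, otherwise fall back to
# detection over the body bytes before decoding.
resp.encoding = resp.encoding or resp.apparent_encoding
html = resp.text
```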
@@ -527,11 +418,10 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
     max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
     dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
     title_chars: Annotated[int, "Character cap applied to titles."] = 80,
-    output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
 ) -> str:
     """
-    Run a DuckDuckGo search and return…
-
+    Run a DuckDuckGo search and return ultra-compact JSONL with short keys to
+    minimize tokens.

     Args:
         query: The search query (supports operators like site:, quotes, OR).
@@ -540,36 +430,24 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         max_snippet_chars: Character cap applied to each snippet when included.
         dedupe_domains: If true, only the first result from each domain is kept.
         title_chars: Character cap applied to titles.
-        output_format: Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text.

     Returns:
-        str:…
-
+        str: Newline-delimited JSON (JSONL). Each line has:
+             {"t": "title", "u": "url"[, "s": "snippet"]}
     """
     if not query or not query.strip():
-        return "…
-
-    # Apply rate limiting to avoid being blocked by DuckDuckGo
-    _apply_rate_limit()
+        return ""

     try:
         with DDGS() as ddgs:
             raw = ddgs.text(query, max_results=max_results)
     except Exception as e:
-        …
-        if output_format == "readable":
-            return f"Error: {error_msg}. This could be due to DuckDuckGo's bot detection or network issues. Please try rephrasing your search or try again in a few minutes."
-        return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
-
-    if not raw:
-        if output_format == "readable":
-            return f"No results found for query: {query}. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes."
-        return ""
+        return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

     seen_domains = set()
-    …
+    lines: List[str] = []

-    for …
+    for r in raw or []:
         title = _shorten((r.get("title") or "").strip(), title_chars)
         url = (r.get("href") or r.get("link") or "").strip()
         body = (r.get("body") or r.get("snippet") or "").strip()
@@ -583,35 +461,16 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
             continue
         seen_domains.add(dom)

-        …
-            obj["s"] = _shorten(body, max_snippet_chars)
-            results.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
-
-    if output_format == "readable":
-        if not results:
-            return f"No results found for query: {query}"
-
-        output = [f"Found {len(results)} search results for: {query}\n"]
-        for result in results:
-            output.append(f"{result['position']}. {result['title']}")
-            output.append(f"   URL: {result['url']}")
-            if result['snippet']:
-                output.append(f"   Summary: {result['snippet']}")
-            output.append("")  # Empty line between results
-
-        return "\n".join(output).rstrip()
-    else:
-        # Return JSONL format (original behavior)
-        return "\n".join(results)
+        obj = {"t": title or _domain_of(url), "u": url}
+
+        if include_snippets and body:
+            obj["s"] = _shorten(body, max_snippet_chars)
+
+        # Emit most compact JSON possible (no spaces)
+        lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
+
+    # Join as JSONL (each result on its own line)
+    return "\n".join(lines)


 # ======================================
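The simplified search tool now emits one compact JSON object per line, with `t` (title), `u` (url), and an optional `s` (snippet). A client parses it line by line; the sample lines below are invented:

```python
# Consuming the tool's JSONL output (sample lines are invented).
import json

jsonl = (
    '{"t":"Example Domain","u":"https://example.com"}\n'
    '{"t":"Python","u":"https://python.org","s":"Official site"}'
)
for line in jsonl.splitlines():
    if line:
        r = json.loads(line)
        print(r["t"], r["u"], r.get("s", ""))  # "s" may be absent
```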
@@ -890,7 +749,7 @@ fetch_interface = gr.Interface(
         "'Full-page Markdown (Content Scraper mode)' option to return the page "
         "converted to Markdown."
     ),
-    …
+    allow_flagging="never",
 )

 # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
@@ -903,19 +762,18 @@ concise_interface = gr.Interface(
         gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
         gr.Checkbox(value=True, label="Dedupe by domain"),
         gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
-        gr.Dropdown(label="Output Format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact output, Readable for LLM-friendly format"),
     ],
-    outputs=gr.Textbox(label="…
+    outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
     title="DuckDuckGo Search",
     description=(
-        "<div style=\"text-align:center\">…
+        "<div style=\"text-align:center\">Very concise web search to avoid unnecessary context. Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.</div>"
     ),
     api_description=(
-        "Run a DuckDuckGo search…
-        "…
-        "…
+        "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
+        "t=title, u=url, optional s=snippet. Options control result count, "
+        "snippet inclusion and length, domain deduping, and title length."
     ),
-    …
+    allow_flagging="never",
     submit_btn="Search",
 )

@@ -937,7 +795,7 @@ code_interface = gr.Interface(
         "Returns:\n"
         "- string: Combined stdout produced by the code, or the exception text if execution failed."
     ),
-    …
+    allow_flagging="never",
 )

 CSS_STYLES = """
@@ -1002,7 +860,7 @@ kokoro_interface = gr.Interface(
         "Can generate audio of unlimited length by processing all text segments. "
         "Return the generated media to the user in this format ``"
     ),
-    …
+    allow_flagging="never",
 )

 # ==========================
@@ -1128,7 +986,7 @@ image_generation_interface = gr.Interface(
         "sampler (str, label only), seed (int, -1=random), width/height (int, 64–1216). Returns a PIL.Image. "
         "Return the generated media to the user in this format ``"
     ),
-    …
+    allow_flagging="never",
 )

 # ==========================
@@ -1305,7 +1163,7 @@ video_generation_interface = gr.Interface(
         "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
         "width/height (int), fps (int), duration (float). Return the generated media to the user in this format ``"
     ),
-    …
+    allow_flagging="never",
 )

 # Build tabbed app; disable Image/Video tools if no HF token is present
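The trailing context line notes that the tabbed app disables the Image/Video tools when no HF token is present. The wiring itself sits outside these hunks; a plausible sketch of that gating, with the env-var name and list assembly assumed:

```python
# Assumed token-gating pattern (the commit's actual wiring is not shown here).
import os
import gradio as gr

interfaces = [fetch_interface, concise_interface, code_interface, kokoro_interface]
names = ["Fetch Webpage", "DuckDuckGo Search", "Code", "Kokoro TTS"]
if os.getenv("HF_TOKEN"):  # env-var name is an assumption
    interfaces += [image_generation_interface, video_generation_interface]
    names += ["Image Generation", "Video Generation"]

demo = gr.TabbedInterface(interfaces, names)
```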