Spaces:

Nymbo
/

Tools

Running

App Files Files Community

Nymbo committed on Aug 23

Commit

c730636

verified ·

1 Parent(s): f9ecb69

Create app.py

Browse files

Files changed (1) hide show

app.py +710 -0

app.py ADDED Viewed

	@@ -0,0 +1,710 @@

+# File: main/app.py
+# Purpose: One Space that offers six tools/tabs:
+#   1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
+#   2) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
+#   3) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
+#   4) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
+#   5) Generate Sitemap — LIMITED: grouped internal/external links with an optional per-domain cap (returned as Markdown)
+#   6) Execute Python — run a Python snippet and return its captured stdout or the error message
+from __future__ import annotations
+import re
+import json
+import sys
+from io import StringIO
+from typing import List, Dict, Literal, Tuple
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+from readability import Document
+from urllib.parse import urljoin, urldefrag, urlparse
+from langchain_community.tools import DuckDuckGoSearchResults
+from duckduckgo_search import DDGS
+# ==============================
+# Fetch: HTTP + extraction utils
+# ==============================
def _http_get(url: str) -> requests.Response:
    """
    Issue one GET request for ``url`` and return the raw response object.

    The request carries a fixed set of browser-like headers and a 15-second
    timeout. No status checking happens here; callers decide how to treat
    error responses.
    (Layman's terms: grab the web page like a normal browser would, but quickly.)
    """
    request_headers = dict(
        [
            ("User-Agent", "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)"),
            ("Accept-Language", "en-US,en;q=0.9"),
            ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ]
    )
    response = requests.get(url, headers=request_headers, timeout=15)
    return response
+def _normalize_whitespace(text: str) -> str:
+    """
+    Squeeze extra spaces and blank lines to keep things compact.
+    (Layman's terms: tidy up the text so it’s not full of weird spacing.)
+    """
+    text = re.sub(r"[ \t\u00A0]+", " ", text)
+    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
+    return text.strip()
+def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
+    """
+    Cut text if it gets too long; return the text and whether we trimmed.
+    (Layman's terms: shorten long text and tell us if we had to cut it.)
+    """
+    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
+        return text, False
+    return text[:max_chars].rstrip() + " …", True
+def _shorten(text: str, limit: int) -> str:
+    """
+    Hard cap a string with an ellipsis to keep tokens small.
+    (Layman's terms: force a string to a max length with an ellipsis.)
+    """
+    if limit <= 0 or len(text) <= limit:
+        return text
+    return text[: max(0, limit - 1)].rstrip() + "…"
+def _domain_of(url: str) -> str:
+    """
+    Show a friendly site name like "example.com".
+    (Layman's terms: pull the website's domain.)
+    """
+    try:
+        return urlparse(url).netloc or ""
+    except Exception:
+        return ""
def _meta(soup: BeautifulSoup, name: str) -> str | None:
    """Return the ``content`` of the first ``<meta name=...>`` tag, or None if absent."""
    tag = soup.find("meta", attrs={"name": name})
    if not tag or not tag.has_attr("content"):
        return None
    return tag.get("content")
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the ``content`` of the first ``<meta property=...>`` tag, or None if absent."""
    tag = soup.find("meta", attrs={"property": prop})
    if not tag or not tag.has_attr("content"):
        return None
    return tag.get("content")
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """
    Collect basic page metadata from a parsed document.

    The returned dict always has the keys ``title``, ``description``,
    ``canonical``, ``site_name``, ``lang``, ``fetched_url`` and ``domain``;
    values that the page does not provide are empty strings.
    (Layman's terms: gather page basics like title/description/address.)
    """

    def _first_nonblank(options) -> str:
        # First candidate that still has content after stripping, else "".
        for option in options:
            if option and option.strip():
                return option.strip()
        return ""

    info: Dict[str, str] = {}

    # Title preference: <title> > og:title > twitter:title
    title_tag = soup.title
    raw_title = title_tag.string if title_tag and title_tag.string else None
    info["title"] = _first_nonblank(
        [raw_title, _og(soup, "og:title"), _meta(soup, "twitter:title")]
    )

    # Description preference: description > og:description > twitter:description
    info["description"] = _first_nonblank(
        [
            _meta(soup, "description"),
            _og(soup, "og:description"),
            _meta(soup, "twitter:description"),
        ]
    )

    # Canonical link (helps dedupe)
    canonical_tag = soup.find("link", rel=lambda v: v and "canonical" in v)
    if canonical_tag:
        info["canonical"] = (canonical_tag.get("href") or "").strip()
    else:
        info["canonical"] = ""

    # Site name and document language, when declared
    info["site_name"] = (_og(soup, "og:site_name") or "").strip()
    root = soup.find("html")
    info["lang"] = (root.get("lang") or "").strip() if root else ""

    # Where the content actually came from
    info["fetched_url"] = final_url
    info["domain"] = _domain_of(final_url)
    return info
def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    Use Readability to isolate the main article and turn it into clean text.

    Args:
        html: Full HTML source of the page.

    Returns:
        ``(clean_text, soup_of_readable_html)`` where ``clean_text`` is the
        text of the article's paragraphs, list items, sub-headings and
        block quotes joined by blank lines, and the soup is the parsed
        Readability summary (with script/style/noscript/iframe/svg removed).
    (Layman's terms: find the real article text and clean it.)
    """
    # Simplified article HTML from Readability
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)
    # Parse simplified HTML
    s = BeautifulSoup(readable_html, "lxml")
    # Remove noisy tags
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()
    # Keep paragraphs, list items, and subheadings for structure without bloat
    block_tags = ["p", "li", "h2", "h3", "h4", "blockquote"]
    text_parts: List[str] = []
    for node in s.find_all(block_tags):
        # find_all also yields blocks nested inside other blocks (a <p> in a
        # <blockquote>, an <li> in an <li>). The outer block's text already
        # contains the inner text, so emitting both would duplicate it.
        if node.find_parent(block_tags) is not None:
            continue
        chunk = node.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)
    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """
    Collect clean, unique, absolute links from the readable section only.

    Args:
        readable_soup: Parsed HTML to scan for ``<a href>`` elements.
        base_url: URL used to resolve relative hrefs.
        max_links: Stop after this many links; zero or a negative value
            means no cap.

    Returns:
        ``(text, url)`` pairs in document order. URLs are absolute with the
        fragment removed; link text longer than 120 characters is shortened,
        and an empty link text falls back to the URL itself.
    (Layman's terms: pull a tidy list of links from the article body.)
    """
    # Prefixes that never lead to a fetchable page. Compared case-insensitively
    # because URI schemes are case-insensitive, and including "tel:" so this
    # filter agrees with the one used by Generate_Sitemap.
    skipped_prefixes = ("#", "mailto:", "javascript:", "tel:")
    seen = set()
    links: List[Tuple[str, str]] = []
    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()
        # Skip junk links we can't use
        if not href or href.lower().startswith(skipped_prefixes):
            continue
        # Resolve relative URLs, strip fragments (#…)
        absolute, _ = urldefrag(urljoin(base_url, href))
        if absolute in seen:
            continue
        seen.add(absolute)
        text = a.get_text(" ", strip=True)
        if len(text) > 120:
            text = text[:117] + "…"
        links.append((text or absolute, absolute))
        # Chained comparison: only enforce the cap when max_links is positive.
        if len(links) >= max_links > 0:
            break
    return links
+def _format_markdown(
+    meta: Dict[str, str],
+    body: str,
+    body_truncated: bool,
+    links: List[Tuple[str, str]],
+    include_text: bool,
+    include_metadata: bool,
+    include_links: bool,
+    verbosity: str,
+) -> str:
+    """
+    Assemble a compact Markdown summary with optional sections.
+    (Layman's terms: build the final markdown output with options.)
+    """
+    lines: List[str] = []
+    # Title header
+    title = meta.get("title") or meta.get("domain") or "Untitled"
+    lines.append(f"# {title}")
+    # Metadata section (only show what exists)
+    if include_metadata:
+        md: List[str] = []
+        if meta.get("description"):
+            md.append(f"- **Description:** {meta['description']}")
+        if meta.get("site_name"):
+            md.append(f"- **Site:** {meta['site_name']}")
+        if meta.get("canonical"):
+            md.append(f"- **Canonical:** {meta['canonical']}")
+        if meta.get("lang"):
+            md.append(f"- **Language:** {meta['lang']}")
+        if meta.get("fetched_url"):
+            md.append(f"- **Fetched From:** {meta['fetched_url']}")
+        if md:
+            lines.append("## Metadata")
+            lines.extend(md)
+    # Body text
+    if include_text and body:
+        if verbosity == "Brief":
+            brief, was_more = _truncate(body, 800)
+            lines.append("## Text")
+            lines.append(brief)
+            if was_more or body_truncated:
+                lines.append("\n> (Trimmed for brevity)")
+        else:
+            lines.append("## Text")
+            lines.append(body)
+            if body_truncated:
+                lines.append("\n> (Trimmed for brevity)")
+    # Links section
+    if include_links and links:
+        lines.append(f"## Links ({len(links)})")
+        for text, url in links:
+            lines.append(f"- [{text}]({url})")
+    return "\n\n".join(lines).strip()
def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20,
) -> str:
    """
    Fetch a web page and return a compact Markdown summary that includes title, key
    metadata, readable main text, and outbound links.
    (Layman's terms: summarize a page with clean text + useful details.)

    Args:
        url (str): Page URL (http/https). If the scheme is omitted, https is
            assumed.
        verbosity (str): "Brief", "Standard" or "Full"; selects the preset cap
            on body length (1200 / 3000 / 999999 characters). Unknown values
            use the "Standard" cap.
        include_metadata (bool): Include the metadata section.
        include_text (bool): Include the readable body text.
        include_links (bool): Include the list of links found in the body.
        max_chars (int): Upper bound on body characters; the smaller of this
            and the verbosity preset is applied. Zero or less means "use the
            preset".
        max_links (int): Maximum number of links to list; zero or less means
            no cap.

    Returns:
        str: Markdown text, or a short message when the URL is empty, the
        request fails, or the response is not HTML.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."
    # Accept "example.com" the same way Generate_Sitemap does.
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"
    # Decode to text. When the server sends no charset, requests falls back to
    # ISO-8859-1 for text/* responses, so `resp.encoding` is set but often
    # wrong; prefer the encoding detected from the bytes in that case.
    if "charset" not in ctype.lower():
        resp.encoding = resp.apparent_encoding or resp.encoding
    html = resp.text
    # Full-page soup for metadata
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)
    # Readable content
    body_text, readable_soup = _extract_main_text(html)
    if not body_text:
        # Fallback to "whole-page text" if Readability found nothing
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)
    # Verbosity presets (we keep the smaller of preset vs. user cap)
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
    target_cap = preset_caps.get(verbosity, 3000)
    user_cap = max_chars if max_chars and max_chars > 0 else target_cap
    cap = min(user_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)
    # Extract links from the simplified content only. Skipped entirely when
    # links are disabled: a cap of 0 means "unlimited" to _extract_links, so
    # passing 0 would do the full scan for output that is never shown.
    links = _extract_links(readable_soup, final_url, max_links=max_links) if include_links else []
    # Final compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
+# ==========================
+# Websearch: DuckDuckGo tool
+# ==========================
def Search_Structured(  # <-- MCP tool #3 (Structured DDG)
    input_query: str,
    max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
    """
    Run a DuckDuckGo search and return structured results as a list of dictionaries.
    (Layman's terms: search DDG and get clean JSON objects.)
    """
    # Blank or missing queries short-circuit to an empty result list.
    if not (input_query and input_query.strip()):
        return []
    # LangChain community wrapper, configured to return a list
    ddg_tool = DuckDuckGoSearchResults(output_format="list", num_results=max_results)
    return ddg_tool.invoke(input_query)
+# ========================================
+# Unstructured DDG: raw list into Textbox
+# ========================================
def Search_Raw(  # <-- MCP tool #4 (Unstructured DDG)
    query: str,
) -> list[dict]:
    """
    Run a DuckDuckGo search using the native `duckduckgo_search` client and return the
    raw Python list of dictionaries from the library.
    (Layman's terms: search DDG and show exactly what the library returns.)
    """
    # Blank or missing queries short-circuit to an empty result list.
    if not (query and query.strip()):
        return []
    # Fixed at five results; the client is closed when the block exits.
    with DDGS() as client:
        return client.text(query, max_results=5)
+# ============================================
+# Concise DDG: ultra-succinct JSONL for tokens
+# ============================================
def Search_Concise(  # <-- MCP tool #2 (Concise DDG)
    query: str,
    max_results: int = 5,
    include_snippets: bool = False,
    max_snippet_chars: int = 80,
    dedupe_domains: bool = True,
    title_chars: int = 80,
) -> str:
    """
    Run a DuckDuckGo search and return ultra-compact JSONL lines with short keys to
    minimize tokens.
    (Layman's terms: the tiniest useful search output possible.)
    """
    if not (query and query.strip()):
        return ""

    def _compact(payload: dict) -> str:
        # Smallest JSON encoding: no spaces after separators, raw Unicode.
        return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))

    try:
        with DDGS() as client:
            hits = client.text(query, max_results=max_results)
    except Exception as err:
        # Report the failure as a single JSON object instead of raising.
        return _compact({"error": str(err)[:120]})

    domains_seen = set()
    emitted: List[str] = []
    for hit in hits or []:
        heading = _shorten((hit.get("title") or "").strip(), title_chars)
        link = (hit.get("href") or hit.get("link") or "").strip()
        snippet = (hit.get("body") or hit.get("snippet") or "").strip()
        if not link:
            continue
        if dedupe_domains:
            # Keep only the first result seen for each domain.
            host = _domain_of(link)
            if host in domains_seen:
                continue
            domains_seen.add(host)
        record = {"t": heading or _domain_of(link), "u": link}
        if include_snippets and snippet:
            record["s"] = _shorten(snippet, max_snippet_chars)
        emitted.append(_compact(record))
    # JSONL: one result object per line
    return "\n".join(emitted)
+# ============================================
+# Generate Sitemap (new MCP tool #5)
+# ============================================
def Generate_Sitemap(
    url: str,
    max_links_per_domain: int = 0,
) -> str:
    """
    Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
    per-domain cap.
    Args:
        url (str): The starting page URL (http/https). If the scheme is omitted,
            https is assumed.
        max_links_per_domain (int): Limit the number of links shown per domain.
            Use 0 to show all links.
    Returns:
        str: Markdown text containing grouped links under "Internal Links" and
        per-domain "External Links (domain)" sections. If an error occurs or no
        links are found, a short message is returned.
    """
    # --- Basic validation & normalization ---
    if not url or not url.strip():
        return "Please enter a valid URL."
    # Strip BEFORE the scheme check: otherwise " https://x.com" fails the
    # startswith test and turns into "https://https://x.com".
    url = url.strip()
    # If the user forgot the scheme, assume https
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    # Sliders and API callers may deliver the cap as a float (e.g. 5.0), which
    # cannot be used as a slice bound; None/0 both mean "no cap".
    per_domain_cap = int(max_links_per_domain or 0)
    # --- Fetch the page safely ---
    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {str(e)}"
    base_url = str(resp.url)  # follow redirects and use the final URL
    content_type = resp.headers.get("Content-Type", "")
    if "html" not in content_type.lower():
        return "The provided URL does not appear to be an HTML page."
    # --- Parse and collect links ---
    soup = BeautifulSoup(resp.content, "lxml")  # fast, lenient HTML parsing
    anchors = soup.find_all("a", href=True)
    seen_urls: set[str] = set()
    items: List[Dict[str, str]] = []
    for a in anchors:
        href = (a.get("href") or "").strip()
        if not href:
            continue
        # Skip non-navigational/unsupported schemes (schemes are
        # case-insensitive, so compare on the lowercased href)
        if href.lower().startswith(("#", "javascript:", "mailto:", "tel:")):
            continue
        # Resolve relative links and strip fragments
        absolute, _ = urldefrag(urljoin(base_url, href))
        # Deduplicate and skip self
        if absolute in seen_urls or absolute == base_url:
            continue
        seen_urls.add(absolute)
        # Use link text if available; otherwise the URL itself
        text = (a.get_text(" ", strip=True) or href).strip()
        if len(text) > 100:
            text = text[:100] + "..."
        items.append({"text": text, "url": absolute})
    if not items:
        return "No links found on this page."
    # --- Group by Internal vs External domains ---
    base_netloc = urlparse(base_url).netloc
    domain_groups: Dict[str, List[Dict[str, str]]] = {}
    for it in items:
        netloc = urlparse(it["url"]).netloc
        key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
        domain_groups.setdefault(key, []).append(it)
    # --- Build Markdown with optional per-domain limit ---
    md_lines: List[str] = [
        "# Sitemap",
        f"Base URL: {base_url}",
        f"Found {len(items)} links:\n",
    ]
    # Show Internal first, then external groups sorted by name
    keys_sorted = ["Internal Links"] + sorted(k for k in domain_groups if k != "Internal Links")
    for group_key in keys_sorted:
        if group_key not in domain_groups:
            # Only "Internal Links" can be missing (page with external links only).
            continue
        group_links = domain_groups[group_key]
        md_lines.append(f"## {group_key}\n")
        if per_domain_cap > 0:
            links_to_show = group_links[:per_domain_cap]
            remaining = max(0, len(group_links) - per_domain_cap)
        else:
            links_to_show = group_links
            remaining = 0
        md_lines.extend(f"- [{link['text']}]({link['url']})" for link in links_to_show)
        if remaining > 0:
            md_lines.append(f"- ... and {remaining} more links")
        md_lines.append("")  # blank line after each group
    return "\n".join(md_lines).strip()
+# ======================================
+# Code Execution: Python (MCP tool #6)
+# ======================================
def Execute_Python(code: str) -> str:
    """
    Execute Python code and return the stdout or error message.
    Mirrors the standalone code interpreter behavior.

    Args:
        code (str): Python source to run.

    Returns:
        str: Everything the code printed to stdout, or — if it raised an
        exception — the exception's message. "No code provided." when
        ``code`` is None.
    """
    if code is None:
        return "No code provided."
    # SECURITY: this runs arbitrary caller-supplied code inside the server
    # process with the server's privileges. It is NOT sandboxed; only expose
    # this tool where that is acceptable.
    #
    # Run the snippet in its own fresh module-like namespace. With a bare
    # `exec(code)` inside a function, top-level assignments land in a
    # throwaway locals mapping while functions defined by the snippet look
    # names up in this module's globals — so `x = 1; def f(): return x; f()`
    # fails with NameError, and the snippet can read or overwrite the app's
    # own globals.
    namespace = {"__name__": "__main__"}
    captured = StringIO()
    # sys.stdout is process-wide, so output from anything else printing while
    # the snippet runs is captured as well.
    previous_stdout = sys.stdout
    sys.stdout = captured
    try:
        exec(code, namespace)
    except Exception as e:
        return str(e)
    finally:
        # Always restore stdout, including when the snippet raises.
        sys.stdout = previous_stdout
    return captured.getvalue()
+# ======================
+# UI: six-tab interface
+# ======================
+# --- Fetch tab (compact controllable extraction) ---
fetch_interface = gr.Interface(
    fn=Fetch_Webpage,  # connect the function to the UI
    # One widget per Fetch_Webpage parameter, in signature order.
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
        gr.Checkbox(value=True, label="Include Metadata"),
        gr.Checkbox(value=True, label="Include Main Text"),
        gr.Checkbox(value=True, label="Include Links"),
        gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
    title="Fetch Webpage",
    description="Extract title, key metadata, readable text, and links. No noisy HTML.",
    api_description=(
        "Fetch a web page and return a compact Markdown summary with title, key "
        "metadata, readable body text, and outbound links. Parameters let you "
        "control verbosity, whether to include metadata/text/links, and limits "
        "for characters and number of links."
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    theme="Nymbo/Nymbo_Theme",
)
+# --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
concise_interface = gr.Interface(
    fn=Search_Concise,
    # One widget per Search_Concise parameter, in signature order.
    inputs=[
        gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
        gr.Checkbox(value=False, label="Include snippets (adds tokens)"),
        gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
        gr.Checkbox(value=True, label="Dedupe by domain"),
        gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
    ],
    outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
    title="DuckDuckGo Search (Concise)",
    description="Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.",
    api_description=(
        "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
        "t=title, u=url, optional s=snippet. Options control result count, "
        "snippet inclusion and length, domain deduping, and title length."
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Search",
)
+# --- Websearch tab (structured DDG via LangChain) ---
websearch_interface = gr.Interface(
    fn=Search_Structured,  # connect the function to the UI
    # Query text and result count, matching Search_Structured's parameters.
    inputs=[
        gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
    ],
    outputs=gr.JSON(label="Search results"),
    title="DuckDuckGo Search (Structured)",
    description="Search the web using DuckDuckGo; returns snippet, title, and link.",
    api_description=(
        "Run a DuckDuckGo web search and return a list of objects with keys: "
        "snippet, title, and link. Configure the number of results."
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    theme="Nymbo/Nymbo_Theme",
)
+# --- Unstructured DDG tab (matches your separate app’s output) ---
unstructured_interface = gr.Interface(
    fn=Search_Raw,
    inputs=gr.Textbox(label="Enter Search Query"),
    # Search_Raw returns a list of dicts; it is displayed in a plain Textbox.
    outputs=gr.Textbox(label="Results", interactive=False),
    title="DuckDuckGo Search (Raw)",
    description="Returns the raw list of results (list[dict]) shown as text.",
    api_description=(
        "Run DuckDuckGo via the native client and return the raw list[dict] as "
        "provided by duckduckgo_search (fields like title, href/link, body/snippet)."
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Search",
)
+# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
sitemap_interface = gr.Interface(
    fn=Generate_Sitemap,
    # URL and per-domain cap, matching Generate_Sitemap's parameters.
    inputs=[
        gr.Textbox(
            label="Website URL",
            placeholder="https://example.com or example.com",
        ),
        gr.Slider(
            minimum=0,
            maximum=1000,
            value=0,
            step=1,
            label="Max links per domain (0 = show all)",
        ),
    ],
    outputs=gr.Markdown(label="Sitemap (Markdown)"),
    title="Generate Sitemap",
    description="Group links by Internal/External domains; optionally limit links per domain.",
    api_description=(
        "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
        "Internal or External (per domain). Set a per-domain cap; 0 shows all."
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    theme="Nymbo/Nymbo_Theme",
    submit_btn="Generate",
)
+# --- Execute Python tab (simple code interpreter) ---
code_interface = gr.Interface(
    fn=Execute_Python,
    inputs=gr.Code(label="Python Code", language="python"),
    outputs=gr.Textbox(label="Output"),
    title="Python Code Executor",
    description="Execute Python code and see the output. This app is also an MCP server for LLMs.",
    # Every other tab supplies an api_description; this one now does too.
    api_description=(
        "Execute a snippet of Python code and return whatever it printed to "
        "stdout, or the error message if it raised an exception."
    ),
    # `allow_flagging` is deprecated in Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
    theme="Nymbo/Nymbo_Theme",
)
+# --- Combine all into a single app with tabs ---
# Tab label paired with the interface it shows, in display order.
_TABS = [
    ("Fetch Webpage", fetch_interface),
    ("DuckDuckGo Search (Concise)", concise_interface),
    ("DuckDuckGo Search (Structured)", websearch_interface),
    ("DuckDuckGo Search (Raw)", unstructured_interface),
    ("Generate Sitemap", sitemap_interface),
    ("Python Code Executor", code_interface),
]

# Single tabbed app that hosts every tool.
demo = gr.TabbedInterface(
    interface_list=[interface for _, interface in _TABS],
    tab_names=[label for label, _ in _TABS],
    title="Web MCP — Fetch, Search, Sitemaps, and Code Execution.",
    theme="Nymbo/Nymbo_Theme",
)

# Launch the UI and expose all functions as MCP tools in one server
if __name__ == "__main__":
    demo.launch(mcp_server=True)