adding markdown scraper option to Fetch_Webpage tool
app.py
CHANGED
@@ -19,6 +19,7 @@ from typing import List, Dict, Tuple, Annotated
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+from markdownify import markdownify as md
 from readability import Document
 from urllib.parse import urljoin, urldefrag, urlparse
 from duckduckgo_search import DDGS
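The new import assumes the markdownify package is available in the environment; on a Hugging Face Space that corresponds to a requirements.txt entry (not part of this diff):

    markdownify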
@@ -175,6 +176,50 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s
 
 
+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Convert the page's main content (or body fallback) to Markdown, similar to
+    web-scraper's Content Scraper tool, but without any file download side-effects.
+
+    Steps:
+    - Remove noisy elements (script/style/nav/footer/header/aside)
+    - Prefer <main>, <article>, or common content containers; fallback to <body>
+    - Convert to Markdown with ATX headings
+    - Clean up excessive newlines, empty links, and whitespace
+    - Prepend a title header when available
+    """
+    # Remove unwanted elements globally first
+    for element in full_soup.select("script, style, nav, footer, header, aside"):
+        element.decompose()
+
+    # Try common main-content containers, then fallback to body
+    main = (
+        full_soup.find("main")
+        or full_soup.find("article")
+        or full_soup.find("div", class_=re.compile(r"content|main|post|article", re.I))
+        or full_soup.find("body")
+    )
+
+    if not main:
+        return "No main content found on the webpage."
+
+    # Convert selected HTML to Markdown
+    markdown_text = md(str(main), heading_style="ATX")
+
+    # Clean up the markdown similar to web-scraper
+    markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
+    markdown_text = re.sub(r"\[\s*\]\([^)]*\)", "", markdown_text)  # empty links
+    markdown_text = re.sub(r"[ \t]+", " ", markdown_text)
+    markdown_text = markdown_text.strip()
+
+    # Add title if present
+    title = full_soup.find("title")
+    if title and title.get_text(strip=True):
+        markdown_text = f"# {title.get_text(strip=True)}\n\n{markdown_text}"
+
+    return markdown_text or "No content could be extracted."
+
+
 def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
     """
     Collect clean, unique, absolute links from the readable section only.
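For a feel of what the new helper does, here is a minimal standalone sketch of the same pipeline (the HTML snippet is invented for illustration; the helper itself additionally restores titles, collapses whitespace, and tries more fallback containers):

    import re
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    html = """<html><head><title>Demo</title></head><body>
    <nav>menu to strip</nav>
    <main><h2>Section</h2><p>Hello <a href="#"></a>world</p></main>
    </body></html>"""

    soup = BeautifulSoup(html, "lxml")
    # Drop the same noisy elements the helper removes
    for element in soup.select("script, style, nav, footer, header, aside"):
        element.decompose()

    main = soup.find("main") or soup.find("body")
    text = md(str(main), heading_style="ATX")     # <h2> becomes "## Section"
    text = re.sub(r"\n{3,}", "\n\n", text)        # collapse blank-line runs
    text = re.sub(r"\[\s*\]\([^)]*\)", "", text)  # strip empty links like [](#)
    print(text.strip())                           # "## Section", blank line, "Hello world"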
@@ -277,6 +322,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     include_links: Annotated[bool, "Include outbound links discovered in the readable section."] = True,
     max_chars: Annotated[int, "Hard cap for body characters after the verbosity preset. Use 0 to disable the cap."] = 3000,
     max_links: Annotated[int, "Maximum number of links to include from the readable content. Set 0 to omit links."] = 20,
+    full_page_markdown: Annotated[bool, "If true, return the page as full Markdown (Content Scraper mode) instead of a compact summary."] = False,
 ) -> str:
     """
     Fetch a web page and return a compact Markdown summary containing title, key
@@ -297,6 +343,10 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     - Metadata (optional)
     - Text (optional, may be trimmed)
     - Links (optional, deduped and absolute)
+
+    Special mode:
+    If full_page_markdown=True, the function returns the page converted to Markdown,
+    similar to the "Content Scraper" tool, ignoring verbosity/include_* limits.
     """
     if not url or not url.strip():
         return "Please enter a valid URL."
@@ -316,10 +366,14 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text
 
-    # Full-page soup for metadata
+    # Full-page soup for metadata (and potential Markdown conversion)
     full_soup = BeautifulSoup(html, "lxml")
     meta = _extract_metadata(full_soup, final_url)
 
+    # Content Scraper mode: return full-page Markdown early
+    if full_page_markdown:
+        return _fullpage_markdown_from_soup(full_soup, final_url)
+
     # Readable content
     body_text, readable_soup = _extract_main_text(html)
     if not body_text:
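A quick way to exercise the new flag outside Gradio, assuming app.py is importable and the remaining parameters keep their defaults (the URL is just an example):

    from app import Fetch_Webpage

    # Default behavior: compact summary (title, metadata, trimmed text, links)
    print(Fetch_Webpage("https://example.com"))

    # New Content Scraper mode: full-page Markdown; other limits are ignored
    print(Fetch_Webpage("https://example.com", full_page_markdown=True))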
@@ -679,17 +733,18 @@ fetch_interface = gr.Interface(
         gr.Checkbox(value=True, label="Include Links"),
         gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
         gr.Slider(0, 100, value=20, step=1, label="Max Links"),
+        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
     ],
     outputs=gr.Markdown(label="Extracted Summary"),
     title="Fetch Webpage",
     description=(
-
+        "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages — or toggle full-page Markdown.</div>"
     ),
     api_description=(
-
-
-
-
+        "Fetch a web page and return a compact Markdown summary with title, key "
+        "metadata, readable body text, and outbound links. Or, enable the "
+        "'Full-page Markdown (Content Scraper mode)' option to return the page "
+        "converted to Markdown."
     ),
     allow_flagging="never",
 )
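One wiring detail: gr.Interface binds its inputs list to the function's parameters by position, which is why the new checkbox is appended after the Max Links slider, lining up with full_page_markdown, the parameter added after max_links. A tiny self-contained illustration of that positional rule:

    import gradio as gr

    def f(text: str, flag: bool = False) -> str:
        return f"{text} (flag={flag})"

    # Second component feeds the second parameter, just as the new
    # "Full-page Markdown" checkbox feeds full_page_markdown.
    demo = gr.Interface(fn=f, inputs=[gr.Textbox(), gr.Checkbox()], outputs="text")
    # demo.launch()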