Nymbo committed
Commit a9decab · verified · 1 parent: 46b04f4

adding new HTML fetching mode

Files changed (1):
  1. Modules/Web_Fetch.py (+65, -35)
Modules/Web_Fetch.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import re
-from typing import Annotated, Dict, List, Tuple
+from typing import Annotated, Dict, Literal, Tuple
 from urllib.parse import urlparse, urljoin
 
 import gradio as gr
@@ -16,10 +16,12 @@ from ._docstrings import autodoc
 
 # Single source of truth for the LLM-facing tool description
 TOOL_SUMMARY = (
-    "Fetch a webpage and return clean Markdown or a list of links, with max length and pagination via offset; "
-    "if truncated, the output includes a notice with next_cursor for exact continuation."
+    "Fetch a webpage and return clean Markdown, raw HTML, or a list of links, with max length and pagination via "
+    "offset; if truncated, the output includes a notice with next_cursor for exact continuation."
 )
 
+ModeOption = Literal["markdown", "html", "url_scraper"]
+
 
 def _http_get_enhanced(url: str, timeout: int | float = 30, *, skip_rate_limit: bool = False) -> requests.Response:
     headers = {
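ModeOption is a Literal alias, so a type checker can verify that only the three supported mode strings flow through code annotated with it. A minimal illustration (the describe function is hypothetical, not part of the module):

from typing import Literal

ModeOption = Literal["markdown", "html", "url_scraper"]

def describe(mode: ModeOption) -> str:
    # Hypothetical consumer annotated with the alias.
    return f"fetching in {mode} mode"

describe("html")        # accepted
describe("plain_text")  # rejected by a static checker such as mypy (still runs at runtime)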
@@ -82,6 +84,21 @@ def _domain_of(url: str) -> str:
     return ""
 
 
+def _normalize_mode(mode: str | None) -> ModeOption:
+    """Convert UI-supplied labels into canonical mode values."""
+    if not mode:
+        return "markdown"
+    normalized = mode.strip().lower()
+    normalized = normalized.replace("-", "_").replace(" ", "_")
+    if normalized in {"markdown", "markdown_mode", "md"}:
+        return "markdown"
+    if normalized in {"html", "html_mode"}:
+        return "html"
+    if normalized in {"url_scraper", "urlscraper", "url_mode", "scraper", "links", "link_mode"}:
+        return "url_scraper"
+    return "markdown"
+
+
 def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
     links = []
     for link in soup.find_all("a", href=True):
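The new helper maps UI-supplied labels and common aliases onto the three canonical values, falling back to "markdown" for anything unrecognised. A quick sketch of that mapping (assumes the private helper is imported directly; purely illustrative):

from Modules.Web_Fetch import _normalize_mode

assert _normalize_mode("Markdown Mode") == "markdown"   # UI label -> canonical value
assert _normalize_mode("HTML Mode") == "html"
assert _normalize_mode("URL Scraper") == "url_scraper"
assert _normalize_mode("links") == "url_scraper"        # accepted alias
assert _normalize_mode(None) == "markdown"              # missing input falls back to markdown
assert _normalize_mode("plain-text") == "markdown"      # unrecognised values fall back as well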
@@ -134,16 +151,16 @@ def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_
     return markdown_text or "No content could be extracted."
 
 
-def _truncate_markdown(markdown: str, max_chars: int) -> Tuple[str, Dict[str, object]]:
-    total_chars = len(markdown)
+def _truncate_with_notice(content: str, max_chars: int) -> Tuple[str, Dict[str, object]]:
+    total_chars = len(content)
     if total_chars <= max_chars:
-        return markdown, {
+        return content, {
             "truncated": False,
             "returned_chars": total_chars,
             "total_chars_estimate": total_chars,
             "next_cursor": None,
         }
-    truncated = markdown[:max_chars]
+    truncated = content[:max_chars]
     last_paragraph = truncated.rfind("\n\n")
     if last_paragraph > max_chars * 0.7:
         truncated = truncated[:last_paragraph]
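The renamed helper keeps the paragraph-aware cut: if a blank-line boundary falls in the last 30% of the character budget, the text is trimmed back to that boundary rather than cut mid-paragraph. A small sketch under that assumption (hypothetical input; the truncation notice and next_cursor metadata are built later in the function, outside this hunk):

content = ("A" * 2500) + "\n\n" + ("B" * 2500)
snippet, meta = _truncate_with_notice(content, max_chars=3000)
# content[:3000].rfind("\n\n") == 2500 and 2500 > 3000 * 0.7,
# so the kept text is trimmed back to the paragraph boundary before the
# truncation notice and next_cursor metadata are attached.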
@@ -180,14 +197,18 @@ def Web_Fetch(
     max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
     offset: Annotated[int, "Character offset to start from (for pagination, use next_cursor from previous call)."] = 0,
     strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
-    url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
+    mode: Annotated[
+        str,
+        "Output mode: 'markdown' (default, clean content), 'html' (raw response), or 'url_scraper' (links list).",
+    ] = "markdown",
 ) -> str:
+    canonical_mode = _normalize_mode(mode)
     _log_call_start(
         "Web_Fetch",
         url=url,
         max_chars=max_chars,
         strip_selectors=strip_selectors,
-        url_scraper=url_scraper,
+        mode=canonical_mode,
         offset=offset,
     )
     if not url or not url.strip():
@@ -210,31 +231,35 @@ def Web_Fetch(
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text
     full_soup = BeautifulSoup(html, "lxml")
-    if url_scraper:
-        result = _extract_links_from_soup(full_soup, final_url)
-        if offset > 0:
-            result = result[offset:]
-        if max_chars > 0 and len(result) > max_chars:
-            result, _ = _truncate_markdown(result, max_chars)
-    else:
-        full_result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
-        if offset > 0:
-            if offset >= len(full_result):
-                result = (
-                    f"Offset {offset} exceeds content length ({len(full_result)} characters). "
-                    f"Content ends at position {len(full_result)}."
-                )
-                _log_call_end("Web_Fetch", _truncate_for_log(result))
-                return result
-            result = full_result[offset:]
-        else:
-            result = full_result
-        if max_chars > 0 and len(result) > max_chars:
-            result, metadata = _truncate_markdown(result, max_chars)
-            if offset > 0:
-                metadata["total_chars_estimate"] = len(full_result)
-                metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
-    _log_call_end("Web_Fetch", f"chars={len(result)}, url_scraper={url_scraper}, offset={offset}")
+    if canonical_mode == "html":
+        _log_call_end("Web_Fetch", f"chars={len(html)}, mode={canonical_mode}, offset=0 (ignored)")
+        return html
+    if canonical_mode == "markdown":
+        full_result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+    elif canonical_mode == "url_scraper":
+        full_result = _extract_links_from_soup(full_soup, final_url)
+    else:
+        full_result = html
+
+    if offset > 0:
+        if offset >= len(full_result):
+            result = (
+                f"Offset {offset} exceeds content length ({len(full_result)} characters). "
+                f"Content ends at position {len(full_result)}."
+            )
+            _log_call_end("Web_Fetch", _truncate_for_log(result))
+            return result
+        result = full_result[offset:]
+    else:
+        result = full_result
+
+    if max_chars > 0 and len(result) > max_chars:
+        result, metadata = _truncate_with_notice(result, max_chars)
+        if offset > 0:
+            metadata["total_chars_estimate"] = len(full_result)
+            metadata["next_cursor"] = offset + metadata["next_cursor"] if metadata["next_cursor"] else None
+
+    _log_call_end("Web_Fetch", f"chars={len(result)}, mode={canonical_mode}, offset={offset}")
     return result
 
 
@@ -259,13 +284,18 @@ def build_interface() -> gr.Interface:
                 max_lines=1,
                 info="CSS selectors to remove (comma-separated)",
             ),
-            gr.Checkbox(label="URL Scraper", value=False, info="Extract only links instead of content"),
+            gr.Radio(
+                label="Mode",
+                choices=["Markdown Mode", "HTML Mode", "URL Scraper"],
+                value="Markdown Mode",
+                info="Markdown cleans content, HTML returns raw response, URL Scraper lists links.",
+            ),
         ],
         outputs=gr.Markdown(label="Extracted Content"),
         title="Web Fetch",
         description=(
-            "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, "
-            "or extract all links. Supports custom element removal, length limits, and pagination with offset.</div>"
+            "<div style=\"text-align:center\">Convert any webpage to Markdown, inspect the raw HTML response, or "
+            "extract all links. Supports custom element removal, length limits, and pagination with offset.</div>"
         ),
         api_description=TOOL_SUMMARY,
         flagging_mode="never",
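Putting the pieces together, hypothetical calls against the updated signature (the URL and character counts are placeholders, not taken from the commit):

from Modules.Web_Fetch import Web_Fetch

# Default behaviour is unchanged: clean Markdown capped at max_chars.
page_md = Web_Fetch("https://example.com/article", max_chars=3000)

# New raw-HTML mode: returns the response body as-is; offset is ignored here.
raw_html = Web_Fetch("https://example.com/article", mode="html")

# Link extraction, previously requested via url_scraper=True.
links = Web_Fetch("https://example.com/article", mode="url_scraper")

# If the Markdown result was truncated, continue from the next_cursor reported
# in the truncation notice (3000 is only an illustrative offset).
next_chunk = Web_Fetch("https://example.com/article", max_chars=3000, offset=3000)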
 