Nymbo committed on
Commit ed7ddca · verified · 1 Parent(s): ee1c18d

Update app.py

Files changed (1)
  1. app.py +234 -32
app.py CHANGED
@@ -41,20 +41,138 @@ except Exception: # pragma: no cover - optional dependency
  # ==============================
- # Fetch: HTTP + extraction utils
  # ==============================

- def _http_get(url: str) -> requests.Response:
      """
-     Download the page politely with a short timeout and realistic headers.
-     (Layman's terms: grab the web page like a normal browser would, but quickly.)
      """
      headers = {
-         "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
          "Accept-Language": "en-US,en;q=0.9",
-         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      }
-     return requests.get(url, headers=headers, timeout=15)


  def _normalize_whitespace(text: str) -> str:
@@ -355,7 +473,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
          return "Please enter a valid URL."

      try:
-         resp = _http_get(url)
          resp.raise_for_status()
      except requests.exceptions.RequestException as e:
          return f"An error occurred: {e}"
@@ -377,8 +495,8 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
      if full_page_markdown:
          return _fullpage_markdown_from_soup(full_soup, final_url)

-     # Readable content
-     body_text, readable_soup = _extract_main_text(html)
      if not body_text:
          # Fallback to "whole-page text" if Readability found nothing
          fallback_text = full_soup.get_text(" ", strip=True)
@@ -408,9 +526,37 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
  # ============================================
- # DuckDuckGo Search: ultra-succinct JSONL
  # ============================================

  def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
      max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
@@ -418,10 +564,12 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
      dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
      title_chars: Annotated[int, "Character cap applied to titles."] = 80,
  ) -> str:
      """
-     Run a DuckDuckGo search and return ultra-compact JSONL with short keys to
-     minimize tokens.

      Args:
          query: The search query (supports operators like site:, quotes, OR).
@@ -430,22 +578,50 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
          max_snippet_chars: Character cap applied to each snippet when included.
          dedupe_domains: If true, only the first result from each domain is kept.
          title_chars: Character cap applied to titles.

      Returns:
-         str: Newline-delimited JSON (JSONL). Each line has:
-             {"t": "title", "u": "url"[, "s": "snippet"]}
      """
      if not query or not query.strip():
-         return ""

      try:
          with DDGS() as ddgs:
              raw = ddgs.text(query, max_results=max_results)
      except Exception as e:
-         return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

      seen_domains = set()
-     lines: List[str] = []

      for r in raw or []:
          title = _shorten((r.get("title") or "").strip(), title_chars)
@@ -461,16 +637,41 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
              continue
          seen_domains.add(dom)

-         obj = {"t": title or _domain_of(url), "u": url}
-
          if include_snippets and body:
-             obj["s"] = _shorten(body, max_snippet_chars)
-
-         # Emit most compact JSON possible (no spaces)
-         lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
-
-     # Join as JSONL (each result on its own line)
-     return "\n".join(lines)


  # ======================================
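For reference, each line of the JSONL emitted by the pre-change code in the hunk above follows the shape documented in its docstring; an illustrative, made-up line with a snippet included would be:

{"t":"Example Domain","u":"https://example.com/","s":"This domain is for use in illustrative examples."}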
@@ -762,16 +963,17 @@ concise_interface = gr.Interface(
          gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
          gr.Checkbox(value=True, label="Dedupe by domain"),
          gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
      ],
-     outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
      title="DuckDuckGo Search",
      description=(
-         "<div style=\"text-align:center\">Very concise web search to avoid unnecessary context. Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.</div>"
      ),
      api_description=(
-         "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
-         "t=title, u=url, optional s=snippet. Options control result count, "
-         "snippet inclusion and length, domain deduping, and title length."
      ),
      allow_flagging="never",
      submit_btn="Search",
 
@@ -41,20 +41,138 @@ except Exception: # pragma: no cover - optional dependency
  # ==============================
+ # Fetch: Enhanced HTTP + extraction utils
  # ==============================

+ def _http_get_enhanced(url: str) -> requests.Response:
      """
+     Download the page with enhanced headers, timeout handling, and better error recovery.
      """
      headers = {
+         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
          "Accept-Language": "en-US,en;q=0.9",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+         "Accept-Encoding": "gzip, deflate, br",
+         "DNT": "1",
+         "Connection": "keep-alive",
+         "Upgrade-Insecure-Requests": "1",
      }
+
+     # Apply rate limiting
+     _fetch_rate_limiter.acquire()
+
+     try:
+         response = requests.get(
+             url,
+             headers=headers,
+             timeout=30,  # Increased timeout
+             allow_redirects=True,
+             stream=False
+         )
+         response.raise_for_status()
+         return response
+     except requests.exceptions.Timeout:
+         raise requests.exceptions.RequestException("Request timed out. The webpage took too long to respond.")
+     except requests.exceptions.ConnectionError:
+         raise requests.exceptions.RequestException("Connection error. Please check the URL and your internet connection.")
+     except requests.exceptions.HTTPError as e:
+         if response.status_code == 403:
+             raise requests.exceptions.RequestException("Access forbidden. The website may be blocking automated requests.")
+         elif response.status_code == 404:
+             raise requests.exceptions.RequestException("Page not found. Please check the URL.")
+         elif response.status_code == 429:
+             raise requests.exceptions.RequestException("Rate limited. Please try again in a few minutes.")
+         else:
+             raise requests.exceptions.RequestException(f"HTTP error {response.status_code}: {str(e)}")
+
+ def _extract_main_text_enhanced(html: str) -> Tuple[str, BeautifulSoup]:
+     """
+     Enhanced main text extraction with better fallback mechanisms.
+     """
+     try:
+         # Try Readability first
+         doc = Document(html)
+         readable_html = doc.summary(html_partial=True)
+
+         if readable_html and readable_html.strip():
+             soup = BeautifulSoup(readable_html, "lxml")
+
+             # Remove noisy tags more comprehensively
+             for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside", "[role='banner']", "[role='navigation']", "[role='complementary']"]:
+                 for tag in soup.select(sel):
+                     tag.decompose()
+
+             # Extract text with better structure preservation
+             text_parts = []
+             for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "div"]):
+                 chunk = element.get_text(" ", strip=True)
+                 if chunk and len(chunk) > 15:  # Only include substantial content
+                     # Avoid repetitive disclaimers
+                     if not ("responses are generated using ai" in chunk.lower() and len(chunk) < 100):
+                         text_parts.append(chunk)
+
+             if text_parts:
+                 clean_text = _normalize_whitespace("\n\n".join(text_parts))
+                 # Check if we got substantial content
+                 if len(clean_text) > 100:
+                     return clean_text, soup
+
+     except Exception:
+         pass  # Fall through to backup extraction
+
+     # Fallback: Parse original HTML more carefully
+     try:
+         full_soup = BeautifulSoup(html, "lxml")
+
+         # Remove unwanted elements
+         for element in full_soup.select("script, style, nav, footer, header, aside, [role='banner'], [role='navigation'], [role='complementary']"):
+             element.decompose()
+
+         # Try to find main content areas
+         main_content = (
+             full_soup.find("main")
+             or full_soup.find("article")
+             or full_soup.find("div", class_=re.compile(r"content|main|post|article|body", re.I))
+             or full_soup.find("div", id=re.compile(r"content|main|post|article|body", re.I))
+             or full_soup.find("section", class_=re.compile(r"content|main|post|article|body", re.I))
+             or full_soup.find("body")
+             or full_soup
+         )
+
+         if main_content:
+             # More aggressive removal of common noise patterns
+             for element in main_content.select(".disclaimer, .warning, .alert, .notice, [class*='cookie'], [class*='banner'], [id*='cookie'], [id*='banner']"):
+                 element.decompose()
+
+             text = main_content.get_text(" ", strip=True)
+             text = _normalize_whitespace(text)
+
+             # Filter out repetitive text
+             lines = text.split('\n')
+             filtered_lines = []
+             seen_lines = set()
+
+             for line in lines:
+                 line_clean = line.strip()
+                 if len(line_clean) > 10 and line_clean not in seen_lines:
+                     # Skip common disclaimers and repetitive content
+                     if not ("responses are generated using ai" in line_clean.lower() and len(line_clean) < 100):
+                         filtered_lines.append(line)
+                         seen_lines.add(line_clean)
+
+             clean_text = '\n'.join(filtered_lines)
+
+             # Create a minimal soup for link extraction
+             minimal_soup = BeautifulSoup(str(main_content), "lxml")
+             return clean_text, minimal_soup
+
+     except Exception:
+         pass
+
+     # Last resort: Just get all text
+     fallback_soup = BeautifulSoup(html, "lxml")
+     text = fallback_soup.get_text(" ", strip=True)
+     return _normalize_whitespace(text), fallback_soup


  def _normalize_whitespace(text: str) -> str:
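A minimal usage sketch of the two helpers added above (not part of the commit; the URL is illustrative, and _fetch_rate_limiter is the module-level RateLimiter instance introduced later in this diff):

resp = _http_get_enhanced("https://example.com/article")           # rate-limited GET with browser-like headers
body_text, readable_soup = _extract_main_text_enhanced(resp.text)  # Readability first, then heuristic fallbacks
print(body_text[:300])                                             # main-content text with nav/footer/cookie noise stripped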
 
@@ -355,7 +473,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
          return "Please enter a valid URL."

      try:
+         resp = _http_get_enhanced(url)
          resp.raise_for_status()
      except requests.exceptions.RequestException as e:
          return f"An error occurred: {e}"
 
@@ -377,8 +495,8 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
      if full_page_markdown:
          return _fullpage_markdown_from_soup(full_soup, final_url)

+     # Readable content with enhanced extraction
+     body_text, readable_soup = _extract_main_text_enhanced(html)
      if not body_text:
          # Fallback to "whole-page text" if Readability found nothing
          fallback_text = full_soup.get_text(" ", strip=True)
 
@@ -408,9 +526,37 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
  # ============================================
+ # DuckDuckGo Search: Enhanced with error handling & rate limiting
  # ============================================

+ import asyncio
+ from datetime import datetime, timedelta
+
+ class RateLimiter:
+     def __init__(self, requests_per_minute: int = 30):
+         self.requests_per_minute = requests_per_minute
+         self.requests = []
+
+     def acquire(self):
+         """Synchronous rate limiting for non-async context"""
+         now = datetime.now()
+         # Remove requests older than 1 minute
+         self.requests = [
+             req for req in self.requests if now - req < timedelta(minutes=1)
+         ]
+
+         if len(self.requests) >= self.requests_per_minute:
+             # Wait until we can make another request
+             wait_time = 60 - (now - self.requests[0]).total_seconds()
+             if wait_time > 0:
+                 time.sleep(max(1, wait_time))  # At least 1 second wait
+
+         self.requests.append(now)
+
+ # Global rate limiters
+ _search_rate_limiter = RateLimiter(requests_per_minute=20)
+ _fetch_rate_limiter = RateLimiter(requests_per_minute=25)
+
  def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
      max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
 
@@ -418,10 +564,12 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
      max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
      dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
      title_chars: Annotated[int, "Character cap applied to titles."] = 80,
+     output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
  ) -> str:
      """
+     Run a DuckDuckGo search with enhanced error handling and multiple output formats.
+     Returns either compact JSONL (t=title, u=url, optional s=snippet) or readable text
+     format optimized for LLM consumption with better error messages.

      Args:
          query: The search query (supports operators like site:, quotes, OR).
 
@@ -430,22 +578,50 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
          max_snippet_chars: Character cap applied to each snippet when included.
          dedupe_domains: If true, only the first result from each domain is kept.
          title_chars: Character cap applied to titles.
+         output_format: Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text.

      Returns:
+         str: Either JSONL format with {"t": "title", "u": "url"[, "s": "snippet"]}
+         or readable text format for better LLM consumption.
      """
      if not query or not query.strip():
+         error_msg = "No search query provided. Please enter a search term."
+         if output_format == "readable":
+             return error_msg
+         return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))

+     # Validate max_results
+     max_results = max(1, min(20, max_results))
+
      try:
+         # Apply rate limiting to avoid being blocked
+         _search_rate_limiter.acquire()
+
+         # Perform search with timeout handling
          with DDGS() as ddgs:
              raw = ddgs.text(query, max_results=max_results)
+
      except Exception as e:
+         error_msg = f"Search failed: {str(e)[:200]}"
+         if "blocked" in str(e).lower() or "rate" in str(e).lower():
+             error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
+         elif "timeout" in str(e).lower():
+             error_msg = "Search timed out. Please try again with a simpler query."
+         elif "network" in str(e).lower() or "connection" in str(e).lower():
+             error_msg = "Network connection error. Please check your internet connection and try again."
+
+         if output_format == "readable":
+             return f"Error: {error_msg}"
+         return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
+
+     if not raw:
+         no_results_msg = f"No results found for query: {query}"
+         if output_format == "readable":
+             return no_results_msg
+         return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))

      seen_domains = set()
+     results = []

      for r in raw or []:
          title = _shorten((r.get("title") or "").strip(), title_chars)
 
@@ -461,16 +637,41 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
              continue
          seen_domains.add(dom)

+         result_obj = {
+             "title": title or _domain_of(url),
+             "url": url
+         }
+
          if include_snippets and body:
+             result_obj["snippet"] = _shorten(body, max_snippet_chars)
+
+         results.append(result_obj)
+
+     if not results:
+         no_results_msg = f"No valid results found for query: {query}"
+         if output_format == "readable":
+             return no_results_msg
+         return json.dumps({"info": no_results_msg}, ensure_ascii=False, separators=(",", ":"))
+
+     # Format output based on requested format
+     if output_format == "readable":
+         lines = [f"Found {len(results)} search results for: {query}\n"]
+         for i, result in enumerate(results, 1):
+             lines.append(f"{i}. {result['title']}")
+             lines.append(f" URL: {result['url']}")
+             if "snippet" in result:
+                 lines.append(f" Summary: {result['snippet']}")
+             lines.append("")  # Empty line between results
+         return "\n".join(lines)
+     else:
+         # JSONL format with compact keys
+         lines = []
+         for result in results:
+             obj = {"t": result["title"], "u": result["url"]}
+             if "snippet" in result:
+                 obj["s"] = result["snippet"]
+             lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
+         return "\n".join(lines)


  # ======================================
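A small sketch of how the sliding-window RateLimiter added above behaves (illustrative only; a limit of 3 requests per minute is chosen just to make the throttling visible):

limiter = RateLimiter(requests_per_minute=3)
for i in range(5):
    limiter.acquire()   # the 4th and 5th calls sleep until the oldest timestamp is roughly a minute old
    print("request", i, "allowed")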
 
@@ -762,16 +963,17 @@ concise_interface = gr.Interface(
          gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
          gr.Checkbox(value=True, label="Dedupe by domain"),
          gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
+         gr.Radio(label="Output format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact JSON, readable for LLM-friendly text"),
      ],
+     outputs=gr.Textbox(label="Search Results", interactive=False),
      title="DuckDuckGo Search",
      description=(
+         "<div style=\"text-align:center\">Enhanced web search with better error handling and multiple output formats. JSONL format emits compact keys (t,u[,s]), readable format provides LLM-friendly text.</div>"
      ),
      api_description=(
+         "Run a DuckDuckGo search with enhanced error handling and multiple output formats. "
+         "Returns either compact JSONL (t=title, u=url, optional s=snippet) or readable text "
+         "format optimized for LLM consumption with better error messages."
      ),
      allow_flagging="never",
      submit_btn="Search",
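To illustrate the two output formats now selectable through the Output format radio and the output_format parameter, a hedged example of calling the tool function directly (the query and result values here are invented, not real search output):

print(Search_DuckDuckGo("python readability library", max_results=2, output_format="jsonl"))
# {"t":"readability-lxml - PyPI","u":"https://pypi.org/project/readability-lxml/"}
# {"t":"mozilla/readability - GitHub","u":"https://github.com/mozilla/readability"}

print(Search_DuckDuckGo("python readability library", max_results=2, output_format="readable"))
# Found 2 search results for: python readability library
#
# 1. readability-lxml - PyPI
#  URL: https://pypi.org/project/readability-lxml/
#
# 2. mozilla/readability - GitHub
#  URL: https://github.com/mozilla/readability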