Spaces:
Running
Running
File size: 12,729 Bytes
9536c67 483c169 255e074 483c169 b3c07b5 483c169 9536c67 483c169 b3c07b5 9536c67 b3c07b5 483c169 b3c07b5 9536c67 b3c07b5 255e074 483c169 9536c67 483c169 4886f5d 483c169 4886f5d 483c169 4886f5d 483c169 4886f5d 483c169 255e074 4886f5d 483c169 9cfeed9 4886f5d 9cfeed9 4886f5d 9cfeed9 483c169 4886f5d 483c169 22fa711 df26504 2e4cb4f 483c169 8eba581 df26504 cd825a9 483c169 255e074 9536c67 255e074 b3c07b5 483c169 b3c07b5 2e4cb4f 483c169 2e4cb4f 255e074 483c169 b3c07b5 483c169 b3c07b5 483c169 255e074 483c169 255e074 b3c07b5 483c169 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
"""
This module sets up and runs the Gradio web interface for the LLM Web Scraper application.
It orchestrates the UI components, event handling for scraping and LLM extraction,
and integrates with backend services for scraping (FireCrawl, Crawl4AI) and
LLM inference. It also initializes and uses Langfuse for tracing application performance.
"""
import gradio as gr
import firecrawl_client
import crawl4ai_client
import llm_inference_service
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
from langfuse import Langfuse, get_client
# Optional Langfuse tracing.
# Tracing is enabled only when both API keys are configured. In that case the
# Langfuse client is constructed with the configured credentials and host, and
# the module-level `langfuse` handle is obtained through get_client().
# Otherwise `langfuse` is None and the rest of the module skips tracing.
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
    Langfuse(
        public_key=LANGFUSE_PUBLIC_KEY,
        secret_key=LANGFUSE_SECRET_KEY,
        host=LANGFUSE_HOST,
    )
    langfuse = get_client()
else:
    langfuse = None
def parse_model_provider(selection):
    """
    Parses a model and provider from a selection string.

    The expected format is "<model_name> (<provider>)", for example
    "gemini-2.5-flash (google_genai)". Surrounding whitespace is ignored.
    The provider is read from the final parenthesised group, so the model
    name itself may contain slashes, dots or dashes.

    Args:
        selection (str): The string to parse.

    Returns:
        tuple[str, str]: A tuple containing the model name and provider.

    Raises:
        ValueError: If the selection is not a string, is not in the expected
            format, or has an empty model name or provider.
    """
    if isinstance(selection, str):
        text = selection.strip()
        if text.endswith(")"):
            # Drop the closing parenthesis, then split on the LAST " (" so the
            # separator that is checked is the same one that is split on.
            model, separator, provider = text[:-1].rpartition(" (")
            model = model.strip()
            provider = provider.strip()
            provider_is_clean = "(" not in provider and ")" not in provider
            if separator and model and provider and provider_is_clean:
                return model, provider
    raise ValueError(f"Invalid selection format: {selection}")
def llm_response_wrapper(query, scrape_result, model_provider_selection, progress=gr.Progress(track_tqdm=True)):
    """
    A generator function that wraps the LLM inference call for the Gradio UI.

    It yields an initial status message, calls the LLM service to extract
    information, and then yields exactly one final value: the LLM result, or
    an error message if the selection could not be parsed, the service raised,
    or the result was empty.

    Args:
        query (str): The user's query for information extraction.
        scrape_result (str): The scraped markdown content from the website.
        model_provider_selection (str): The selected model and provider string,
            in the "<model_name> (<provider>)" format.
        progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Yields:
        str: Status messages and the final LLM response as a markdown string.
    """
    yield "⏳ Generating response... Please wait."

    try:
        model, provider = parse_model_provider(model_provider_selection)
        result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    except Exception as e:
        # Report failures in the output area, mirroring scrape_website, rather
        # than letting the exception escape the generator.
        yield f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
        return

    if not result or (isinstance(result, str) and result.strip() == ""):
        yield "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
        # Stop here: yielding the empty result afterwards would overwrite the
        # error message that was just shown.
        return

    yield result
async def _run_selected_scraper(url, scraper_selection):
    """Runs the scraper named by ``scraper_selection`` against ``url``.

    Args:
        url (str): The URL of the website to scrape.
        scraper_selection (str): The scraping tool selected by the user.

    Returns:
        tuple[str, bool]: The scraper's output and True when the selection was
        recognised; otherwise the "invalid scraper" message and False.
    """
    if scraper_selection == "Scrape with FireCrawl":
        # The FireCrawl helper is called without await; only Crawl4AI is awaited.
        return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url), True
    if scraper_selection == "Scrape with Crawl4AI":
        return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url), True
    return "❌ <span style='color:red;'>Invalid scraper selected.</span>", False


async def scrape_website(url, scraper_selection, progress=gr.Progress(track_tqdm=True)):
    """An async generator that scrapes a website based on user selection for the Gradio UI.

    This function yields an initial status message, then performs the web scraping
    using the selected tool (FireCrawl or Crawl4AI). If Langfuse is configured,
    it wraps the scraping operation in a "web-scraping" span and records the
    outcome on the trace. An unrecognised scraper selection is recorded as an
    error, not as a success.

    Args:
        url (str): The URL of the website to scrape.
        scraper_selection (str): The scraping tool selected by the user.
        progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Yields:
        str: A status message, followed by the scraped markdown content or an error message.
    """
    # Show a loading message while the scrape is in progress.
    yield "⏳ Scraping website... Please wait."

    if not langfuse:
        # Tracing disabled: run the scraper and report any failure as text.
        try:
            markdown, _ = await _run_selected_scraper(url, scraper_selection)
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
        yield markdown
        return

    with langfuse.start_as_current_span(name="web-scraping", input={"url": url, "scraper": scraper_selection}) as span:
        try:
            markdown, recognized = await _run_selected_scraper(url, scraper_selection)
            if recognized:
                span.update_trace(output={"markdown_char_count": len(markdown), "status": "Success"})
            else:
                span.update_trace(output={"error": "Invalid scraper selected.", "status": "Error"})
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
            span.update_trace(output={"error": str(e), "status": "Error"})

    # Yield only after the span has been closed.
    yield markdown
# Gradio UI
# This block defines the entire Gradio user interface, including layout and component interactions.
# Components render in the order they are created, so statement order here is the page layout.
with gr.Blocks() as gradio_ui:
    # Page header: application title followed by badge links to the tools used.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
<h1 style="margin: 0;"> LLM Web Scraper</h1>
<div style="display: flex; gap: 10px;">
<a href="https://www.langchain.com/" target="_blank">
<img src="https://img.shields.io/badge/LangChain-blue?style=for-the-badge&logo=langchain" alt="LangChain">
</a>
<a href="https://ai.google.dev/gemini-api/docs" target="_blank">
<img src="https://img.shields.io/badge/Gemini-white?style=for-the-badge&logo=google-gemini" alt="Gemini API">
</a>
<a href="https://build.nvidia.com/models" target="_blank">
<img src="https://img.shields.io/badge/NVIDIA-gray?style=for-the-badge&logo=nvidia" alt="NVIDIA NIM">
</a>
<a href="https://firecrawl.dev/" target="_blank">
<img src="https://img.shields.io/badge/FireCrawl-orange?style=for-the-badge&logo=fire" alt="FireCrawl">
</a>
<a href="https://docs.crawl4ai.com/" target="_blank">
<img src="https://img.shields.io/badge/Crawl4AI-blueviolet?style=for-the-badge&logo=github" alt="Crawl4AI">
</a>
<a href="https://playwright.dev/" target="_blank">
<img src="https://img.shields.io/badge/Playwright-brightgreen?style=for-the-badge&logo=playwright" alt="Playwright">
</a>
<a href="https://langfuse.com/" target="_blank">
<img src="https://img.shields.io/badge/Langfuse-blue?style=for-the-badge&logo=langfuse" alt="Langfuse">
</a>
</div>
</div>
""")
    # Link to the project's source repository.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
<span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
<a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
<img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
</a>
</div>
""")
    # Link to the accompanying article.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
<span style="font-size: 16px;">📖 <strong>Read the full story:</strong></span>
<a href="https://medium.com/@frkhan/from-broken-selectors-to-intelligent-scraping-a-journey-into-llm-powered-web-automation-fc76d5fe2dbc" target="_blank">
<img src="https://img.shields.io/badge/Medium-Read%20Story-black?style=for-the-badge&logo=medium" alt="Read Story on Medium">
</a>
</div>
""")
    # Usage instructions, collapsed by default (open=False).
    with gr.Accordion("ℹ️ How to Use This App", open=False):
        gr.Markdown("""
This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:
1. **Enter a URL:** Provide the URL of the web page you want to analyze.
2. **Define Your Query:** Specify the exact information you're looking for (e.g., product name, price, customer ratings).
3. **Scrape the Web Page:** Choose a scraper and click the "Scrape Website" button to extract the content of the page.
4. **Select Model & Provider:** Choose the LLM model you want to use for information extraction.
5. **Extract Info by LLM:** Click the "Extract Info by LLM" button to get the information based on your query.
---
**What makes this different from a regular web scraper?**
Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.
""")
    # Main working area: disclaimer, inputs, scraper controls and LLM controls.
    with gr.Column():
        # Responsible-use disclaimer banner.
        gr.HTML("""
<div style="padding: 12px; border: 1px solid #d32f2f; background-color: #ffebee; border-radius: 8px; margin-bottom: 15px;">
<p style="margin: 0; color: #c62828; font-weight: 500;">
⚠️ <code style="background-color: #ffcdd2; color: #c62828; padding: 2px 5px; border-radius: 4px; font-weight: 600;">Disclaimer:</code>. Please be responsible when scraping websites. Users must comply with the terms of service of any website they scrape and respect
<code style="background-color: #ffcdd2; color: #c62828; padding: 2px 5px; border-radius: 4px; font-weight: 600;">robots.txt</code>.
The developers of this tool are not liable for any misuse.
</p>
</div>
""")
        # User inputs: the page to scrape and the question to answer from it.
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", autofocus=True)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating etc. / Summarize the content of this page")
        with gr.Row():
            # The choice strings must match the literals compared in scrape_website.
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with Crawl4AI", "Scrape with FireCrawl"],
                value="Scrape with Crawl4AI"
            )
            scrape_btn = gr.Button("Scrape Website")
            clear_btn = gr.Button("Clear")
        # Holds the scraped markdown; it is also the input to the LLM extraction step.
        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        with gr.Row():
            # Add a single dropdown for model and provider selection.
            # Each choice uses the "<model_name> (<provider>)" format that
            # parse_model_provider expects.
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )
            llm_response_btn = gr.Button("Extract Info by LLM")
            cancel_btn = gr.Button("Cancel", variant="stop")
        # LLM response output area (rendered as markdown).
        llm_response = gr.Markdown(
            "",
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )
    # No custom loader is used; Gradio shows its own spinner during processing.
    # Scrape button: streams the values yielded by scrape_website into the result textbox.
    scrape_event = scrape_btn.click(
        fn=scrape_website,
        inputs=[url_input, scraper_dropdown],
        outputs=[scrape_result_textbox],
    )
    # Clear button functionality: resets both inputs and both output areas to empty strings.
    clear_btn.click(lambda: ("", "", "", ""), outputs=[url_input, query_input, scrape_result_textbox, llm_response])
    # Extract button: streams the values yielded by llm_response_wrapper into the response area.
    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
    # Cancel button: does no work itself; it only cancels the scrape and LLM events.
    cancel_btn.click(fn=lambda: None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
# Start the web server; server_name="0.0.0.0" listens on all network interfaces.
gradio_ui.launch(server_name="0.0.0.0")
|