| """ | |
| This module sets up and runs the Gradio web interface for the LLM Web Scraper application. | |
| It orchestrates the UI components, event handling for scraping and LLM extraction, | |
| and integrates with backend services for scraping (FireCrawl, Crawl4AI) and | |
| LLM inference. It also initializes and uses Langfuse for tracing application performance. | |
| """ | |
| import gradio as gr | |
| import firecrawl_client | |
| import crawl4ai_client | |
| import llm_inference_service | |
| from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST | |
| from langfuse import Langfuse, get_client | |
| # Initialize Langfuse if configured | |
| langfuse = None | |
| if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY: | |
| Langfuse( | |
| public_key=LANGFUSE_PUBLIC_KEY, | |
| secret_key=LANGFUSE_SECRET_KEY, | |
| host=LANGFUSE_HOST | |
| ) | |
| langfuse = get_client() | |
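# When the Langfuse keys are not set, `langfuse` stays None and
# scrape_website() below falls back to scraping without tracing.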


def parse_model_provider(selection):
    """
    Parses a model and provider from a selection string.

    The expected format is "<model_name> (<provider>)".

    Args:
        selection (str): The string to parse.

    Returns:
        tuple[str, str]: A tuple containing the model name and provider.

    Raises:
        ValueError: If the selection string is not in the expected format.
    """
    if "(" in selection and ")" in selection:
        model = selection.split(" (")[0].strip()
        provider = selection.split(" (")[1].replace(")", "").strip()
        return model, provider
    raise ValueError(f"Invalid selection format: {selection}")
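
# Illustrative usage (values taken from the model dropdown defined below):
#   parse_model_provider("gemini-2.5-flash-lite (google_genai)")
#   returns ("gemini-2.5-flash-lite", "google_genai")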


def llm_response_wrapper(query, scrape_result, model_provider_selection, progress=gr.Progress(track_tqdm=True)):
    """
    A generator function that wraps the LLM inference call for the Gradio UI.

    It yields an initial status message, calls the LLM service to extract information,
    and then yields the final result or an error message.

    Args:
        query (str): The user's query for information extraction.
        scrape_result (str): The scraped markdown content from the website.
        model_provider_selection (str): The selected model and provider string.
        progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Yields:
        str: Status messages and the final LLM response as a markdown string.
    """
    yield "⏳ Generating response... Please wait."

    model, provider = parse_model_provider(model_provider_selection)
    result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)

    if not result or (isinstance(result, str) and result.strip() == ""):
        yield "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
        return  # Stop here so the empty result does not overwrite the error message.

    yield result


async def scrape_website(url, scraper_selection, progress=gr.Progress(track_tqdm=True)):
    """An async generator that scrapes a website based on user selection for the Gradio UI.

    This function yields an initial status message, then performs the web scraping
    using the selected tool (FireCrawl or Crawl4AI). If Langfuse is configured,
    it wraps the scraping operation in a trace for observability.

    Args:
        url (str): The URL of the website to scrape.
        scraper_selection (str): The scraping tool selected by the user.
        progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Yields:
        str: A status message, followed by the scraped markdown content or an error message.
    """
    # 1. Yield an immediate status update so the UI shows a loading state.
    yield "⏳ Scraping website... Please wait."

    markdown = ""

    # 2. Without Langfuse configured, scrape directly and skip tracing.
    if not langfuse:
        try:
            if scraper_selection == "Scrape with FireCrawl":
                markdown = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
            elif scraper_selection == "Scrape with Crawl4AI":
                markdown = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
            else:
                markdown = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
        yield markdown
        return

    # 3. With Langfuse configured, wrap the same scraping logic in a trace span.
    with langfuse.start_as_current_span(name="web-scraping", input={"url": url, "scraper": scraper_selection}) as span:
        try:
            if scraper_selection == "Scrape with FireCrawl":
                markdown = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
            elif scraper_selection == "Scrape with Crawl4AI":
                markdown = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
            else:
                markdown = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
            span.update_trace(output={"markdown_char_count": len(markdown), "status": "Success"})
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
            span.update_trace(output={"error": str(e), "status": "Error"})

    yield markdown


# Gradio UI
# This block defines the entire Gradio user interface, including layout and component interactions.
with gr.Blocks() as gradio_ui:
    gr.HTML("""
        <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
            <h1 style="margin: 0;"> LLM Web Scraper</h1>
            <div style="display: flex; gap: 10px;">
                <a href="https://www.langchain.com/" target="_blank">
                    <img src="https://img.shields.io/badge/LangChain-blue?style=for-the-badge&logo=langchain" alt="LangChain">
                </a>
                <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                    <img src="https://img.shields.io/badge/Gemini-white?style=for-the-badge&logo=google-gemini" alt="Gemini API">
                </a>
                <a href="https://build.nvidia.com/models" target="_blank">
                    <img src="https://img.shields.io/badge/NVIDIA-gray?style=for-the-badge&logo=nvidia" alt="NVIDIA NIM">
                </a>
                <a href="https://firecrawl.dev/" target="_blank">
                    <img src="https://img.shields.io/badge/FireCrawl-orange?style=for-the-badge&logo=fire" alt="FireCrawl">
                </a>
                <a href="https://docs.crawl4ai.com/" target="_blank">
                    <img src="https://img.shields.io/badge/Crawl4AI-blueviolet?style=for-the-badge&logo=github" alt="Crawl4AI">
                </a>
                <a href="https://playwright.dev/" target="_blank">
                    <img src="https://img.shields.io/badge/Playwright-brightgreen?style=for-the-badge&logo=playwright" alt="Playwright">
                </a>
                <a href="https://langfuse.com/" target="_blank">
                    <img src="https://img.shields.io/badge/Langfuse-blue?style=for-the-badge&logo=langfuse" alt="Langfuse">
                </a>
            </div>
        </div>
    """)

    gr.HTML("""
        <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
            <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
            <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
            </a>
        </div>
    """)

    gr.HTML("""
        <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
            <span style="font-size: 16px;">📖 <strong>Read the full story:</strong></span>
            <a href="https://medium.com/@frkhan/from-broken-selectors-to-intelligent-scraping-a-journey-into-llm-powered-web-automation-fc76d5fe2dbc" target="_blank">
                <img src="https://img.shields.io/badge/Medium-Read%20Story-black?style=for-the-badge&logo=medium" alt="Read Story on Medium">
            </a>
        </div>
    """)

    with gr.Accordion("ℹ️ How to Use This App", open=False):
        gr.Markdown("""
This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:

1. **Enter a URL:** Provide the URL of the web page you want to analyze.
2. **Define Your Query:** Specify the exact information you're looking for (e.g., product name, price, customer ratings).
3. **Scrape the Web Page:** Choose a scraper and click the "Scrape Website" button to extract the content of the page.
4. **Select Model & Provider:** Choose the LLM model you want to use for information extraction.
5. **Extract Info by LLM:** Click the "Extract Info by LLM" button to get the information based on your query.

---

**What makes this different from a regular web scraper?**

Traditional web scrapers must be programmed with site-specific extraction rules for each website, which makes them brittle: they break whenever the site's design changes. This app instead uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.
        """)

    with gr.Column():
        gr.HTML("""
            <div style="padding: 12px; border: 1px solid #d32f2f; background-color: #ffebee; border-radius: 8px; margin-bottom: 15px;">
                <p style="margin: 0; color: #c62828; font-weight: 500;">
                    ⚠️ <code style="background-color: #ffcdd2; color: #c62828; padding: 2px 5px; border-radius: 4px; font-weight: 600;">Disclaimer:</code> Please be responsible when scraping websites. Users must comply with the terms of service of any website they scrape and respect
                    <code style="background-color: #ffcdd2; color: #c62828; padding: 2px 5px; border-radius: 4px; font-weight: 600;">robots.txt</code>.
                    The developers of this tool are not liable for any misuse.
                </p>
            </div>
        """)
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", autofocus=True)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating etc. / Summarize the content of this page")

        with gr.Row():
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with Crawl4AI", "Scrape with FireCrawl"],
                value="Scrape with Crawl4AI"
            )
            scrape_btn = gr.Button("Scrape Website")
            clear_btn = gr.Button("Clear")

        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")

        with gr.Row():
            # Add a single dropdown for model and provider selection
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )
            llm_response_btn = gr.Button("Extract Info by LLM")
            cancel_btn = gr.Button("Cancel", variant="stop")

        # LLM response output area and loader
        llm_response = gr.Markdown(
            "",
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )

    # Removed custom loader; Gradio will show a spinner automatically during processing.
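    # Scrape button: streams the status message and then the scraped markdown into the result textbox.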
    scrape_event = scrape_btn.click(
        fn=scrape_website,
        inputs=[url_input, scraper_dropdown],
        outputs=[scrape_result_textbox],
    )

    # Clear button functionality
    clear_btn.click(lambda: ("", "", "", ""), outputs=[url_input, query_input, scrape_result_textbox, llm_response])
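
    # Extract button: streams the status message and then the LLM output into the Markdown component.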
    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
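
    # Cancel button: Gradio's `cancels` argument stops any in-flight scrape or LLM extraction event.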
    cancel_btn.click(fn=lambda: None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
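
# Bind to all network interfaces so the app is reachable from outside the container
# (e.g., when running on Hugging Face Spaces or in Docker).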
gradio_ui.launch(server_name="0.0.0.0")