# Spaces: Running
import html

import gradio as gr

import firecrawl_client
import crawl4ai_client
import llm_inference_service
def parse_model_provider(selection):
    """Split a dropdown value of the form "<model_name> (<provider>)".

    The split happens at the LAST " (" so that a model name which itself
    contains a parenthesised part is kept intact.

    Args:
        selection: The combined "model (provider)" string.

    Returns:
        A ``(model, provider)`` tuple of stripped strings.

    Raises:
        ValueError: If ``selection`` is not a string, lacks the " (" separator
            or the closing ")", or has an empty model or provider part.
    """
    text = selection.strip() if isinstance(selection, str) else ""
    model_part, separator, provider_part = text.rpartition(" (")
    # A missing separator or closing parenthesis means the value does not
    # follow the expected format; report it uniformly as ValueError.
    if not separator or not provider_part.endswith(")"):
        raise ValueError(f"Invalid selection format: {selection}")
    model = model_part.strip()
    provider = provider_part[:-1].strip()
    if not model or not provider:
        raise ValueError(f"Invalid selection format: {selection}")
    return model, provider
def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Run the LLM extraction for the UI and always return displayable text.

    Args:
        query: What the user wants to find in the scraped content.
        scrape_result: The scraped page content to pass to the LLM.
        model_provider_selection: Combined "<model_name> (<provider>)" string.

    Returns:
        The result of ``llm_inference_service.extract_page_info_by_llm``, or a
        red inline error message when the call fails or yields nothing.
    """
    try:
        model, provider = parse_model_provider(model_provider_selection)
        result = llm_inference_service.extract_page_info_by_llm(
            query, scrape_result, model, provider
        )
    except Exception as e:
        # UI boundary: report the failure inline, in the same style as
        # scrape_website. The text is escaped because the output component
        # renders Markdown/HTML.
        return (
            "❌ <span style='color:red;'>An unexpected error occurred: "
            f"{html.escape(str(e))}</span>"
        )
    # Treat None, empty and whitespace-only answers as "nothing extracted".
    if not result or (isinstance(result, str) and not result.strip()):
        return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
    return result
async def scrape_website(url, scraper_selection):
    """Scrape ``url`` with the scraper chosen in the UI.

    Args:
        url: Address of the page to scrape; surrounding whitespace is ignored.
        scraper_selection: Either "Scrape with FireCrawl" or
            "Scrape with Crawl4AI".

    Returns:
        Whatever the selected client returns (presumably Markdown, judging by
        the client function names), or a red inline error message when the
        scraper is unknown, the URL is empty, or the scrape raises.
    """
    try:
        if scraper_selection not in ("Scrape with FireCrawl", "Scrape with Crawl4AI"):
            return "❌ <span style='color:red;'>Invalid scraper selected.</span>"

        # Do not hand an empty URL to the scraper backends.
        target_url = (url or "").strip()
        if not target_url:
            return "❌ <span style='color:red;'>Please enter a URL to scrape.</span>"

        if scraper_selection == "Scrape with FireCrawl":
            # Called without await, so it runs synchronously in this coroutine.
            # NOTE(review): consider offloading to a thread if it blocks for long.
            return firecrawl_client.scrape_and_get_markdown_with_firecrawl(target_url)
        return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(target_url)
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
# Gradio UI
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened listing — confirm the Row/Column grouping against the
# intended layout.
with gr.Blocks() as gradio_ui:
    # Header: title plus badges linking to the frameworks and services used.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
        <h1 style="margin: 0;"> LLM Web Scraper</h1>
        <div style="display: flex; gap: 10px;">
            <a href="https://github.com/langchain-ai/langchain" target="_blank">
                <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
            </a>
            <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
            </a>
            <a href="https://build.nvidia.com/models" target="_blank">
                <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
            </a>
            <a href="https://firecrawl.dev/" target="_blank">
                <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
            </a>
            <a href="https://docs.crawl4ai.com/" target="_blank">
                <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
            </a>
        </div>
    </div>
    """)

    # Link to the project's source repository.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
        <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
            <img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
        </a>
    </div>
    """)

    # Usage instructions. Step 3 names the actual controls: the scraper
    # dropdown and the "Scrape Website" button.
    gr.HTML("""
    <div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
        <h2 style="margin-top: 0;">How to Use This App</h2>
        <p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
        <ol>
            <li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
            <li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
            <li><strong>Scrape the Web Page:</strong> Select a scraper (FireCrawl or Crawl4AI) and click the "Scrape Website" button to extract the content of the page.</li>
            <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
            <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
        </ol>
        <br />
        <br />
        <p><strong>What makes this different from a regular web scraper?</strong> </p>
        <p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
    </div>
    """)

    with gr.Column():
        url_input = gr.Textbox(
            label="Enter URL to scrape",
            placeholder="https://example.com/query?search=cat+food",
            lines=1,
        )
        query_input = gr.Textbox(
            label="What information do you want to find?",
            placeholder="Find product name, price, rating",
            lines=1,
        )

        with gr.Row():
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
                value="Scrape with FireCrawl",
            )
            scrape_btn = gr.Button("Scrape Website")

        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")

        with gr.Row():
            # One dropdown carries both model and provider, formatted as
            # "<model_name> (<provider>)"; parse_model_provider splits it.
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)",
            )
            llm_response_btn = gr.Button("Extract Info by LLM")
            cancel_btn = gr.Button("Cancel", variant="stop")

        # LLM response output area.
        llm_response = gr.Markdown(
            "\n" * 9,  # 9 newlines + 1 line for empty content = 10 lines minimum
            label="LLM Response",
            show_copy_button=True,
            visible=True,
        )
        # No custom loader: Gradio shows a spinner automatically during processing.

    # Event wiring. The events are kept in variables so the Cancel button can
    # reference them.
    scrape_event = scrape_btn.click(
        fn=scrape_website,
        inputs=[url_input, scraper_dropdown],
        outputs=scrape_result_textbox,
    )
    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response,
    )
    cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])

# Bind to all network interfaces rather than only localhost.
gradio_ui.launch(server_name="0.0.0.0")