"""Gradio front-end for an LLM-powered web scraper.

Flow: the user supplies a URL, FireCrawl scrapes it to markdown, and a
selected LLM (Google GenAI or NVIDIA NIM, via ``llm_inference_service``)
extracts the information the user asked for from the scraped content.
"""

import gradio as gr

import firecrawl_client
import llm_inference_service


def parse_model_provider(selection):
    """Split a dropdown selection of the form ``"<model> (<provider>)"``.

    Args:
        selection: Display string, e.g. ``"gemini-2.5-pro (google_genai)"``.

    Returns:
        ``(model, provider)`` tuple of stripped strings.

    Raises:
        ValueError: If *selection* does not end in a ``" (<provider>)"``
            suffix.
    """
    # rpartition splits on the LAST " (" so model names that themselves
    # contain " (" still parse correctly; it also avoids the IndexError the
    # previous split(" (")[1] raised on inputs like "model(x)".
    model, sep, provider = selection.rpartition(" (")
    if sep and provider.endswith(")"):
        return model.strip(), provider[:-1].strip()
    raise ValueError(f"Invalid selection format: {selection}")


def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Run LLM extraction over scraped content and format the UI response.

    Args:
        query: What the user wants extracted (free text).
        scrape_result: Markdown produced by the FireCrawl scrape step.
        model_provider_selection: Dropdown value, ``"<model> (<provider>)"``.

    Returns:
        The extraction result, or a user-facing error string when the
        service returned nothing usable.
    """
    model, provider = parse_model_provider(model_provider_selection)
    result = llm_inference_service.extract_page_info_by_llm(
        query, scrape_result, model, provider
    )
    # Treat None / empty / whitespace-only results as "nothing extracted".
    if not result or (isinstance(result, str) and result.strip() == ""):
        return (
            "❌ No information could be extracted from the scraped content."
            " Please check your query or try a different model/provider."
        )
    return result


# ---------------------------------------------------------------------------
# Gradio UI
# NOTE(review): the original HTML tags inside the gr.HTML banners were lost
# during extraction — only their text content survived. Restore the real
# markup from the repository if available.
# ---------------------------------------------------------------------------
with gr.Blocks() as gradio_ui:
    gr.HTML("""

LLM Web Scraper

LangChain Gemini API NVIDIA NIM FireCrawl Crawl4AI
""")
    gr.HTML("""
📦 Download the full source code: GitHub Repo
""")
    gr.HTML("""

How to Use This App

This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:

  1. Enter a URL: Provide the URL of the web page you want to analyze.
  2. Define Your Query: Specify the exact information you're looking for (e.g., product name, price, customer ratings).
  3. Scrape the Web Page: Click the "Scrape with FireCrawl" button to extract the content of the page.
  4. Select Model & Provider: Choose the LLM model you want to use for information extraction.
  5. Extract Info by LLM: Click the "Extract Info by LLM" button to get the information based on your query.

What makes this different from a regular web scraper? Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to understand your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.

""")

    # NOTE(review): widget nesting below is reconstructed — the original
    # indentation was lost in extraction. Verify against the repository.
    with gr.Column():
        url_input = gr.Textbox(
            label="Enter URL to scrape",
            placeholder="https://example.com/query?search=cat+food",
            lines=1,
        )
        query_input = gr.Textbox(
            label="What information do you want to find?",
            placeholder="Find product name, price, rating",
            lines=1,
        )
        scrape_btn = gr.Button("Scrape with FireCrawl")
        scrape_result_textbox = gr.Textbox(
            label="Scrape Result", lines=20, show_copy_button=True
        )

    # Spacer; original markup lost in extraction.
    gr.HTML("\n")
    gr.Markdown("### 🧠 LLM Extraction")
    gr.Markdown(
        "Use a language model to extract structured information from the scraped content."
    )
    # Spacer; original markup lost in extraction.
    gr.HTML("\n")

    with gr.Row():
        # Single dropdown combining model and provider selection; the value
        # is parsed back apart by parse_model_provider().
        model_provider_dropdown = gr.Dropdown(
            label="Select Model & Provider",
            choices=[
                "gemini-2.5-flash-lite (google_genai)",
                "gemini-2.5-pro (google_genai)",
                "gemini-2.5-flash (google_genai)",
                "bytedance/seed-oss-36b-instruct (nvidia)",
                "deepseek-ai/deepseek-v3.1 (nvidia)",
                "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
            ],
            value="gemini-2.5-flash-lite (google_genai)",
        )

    llm_response_btn = gr.Button("Extract Info by LLM")

    # LLM response output area. No custom loader: Gradio shows its own
    # spinner automatically while the handler runs.
    llm_response = gr.Markdown(
        "\n" * 9,  # 9 newlines + 1 line for empty content = 10 lines minimum
        label="LLM Response",
        show_copy_button=True,
        visible=True,
    )

    # Event wiring.
    scrape_btn.click(
        fn=firecrawl_client.scrape_and_get_markdown_with_firecrawl,
        inputs=url_input,
        outputs=scrape_result_textbox,
    )
    llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response,
    )

# Bind on all interfaces so the app is reachable from outside a container.
gradio_ui.launch(server_name="0.0.0.0")