import gradio as gr
import firecrawl_client
import crawl4ai_client
import llm_inference_service


def parse_model_provider(selection):
    # Expected format: "<model_name> (<provider>)"
    if "(" in selection and ")" in selection:
        model = selection.split(" (")[0].strip()
        provider = selection.split(" (")[1].replace(")", "").strip()
        return model, provider
    raise ValueError(f"Invalid selection format: {selection}")
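
# Worked example (illustrative values matching the dropdown choices defined below):
#   parse_model_provider("gemini-2.5-pro (google_genai)") -> ("gemini-2.5-pro", "google_genai")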


def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Extract the requested information from the scraped content using the selected model/provider."""
    model, provider = parse_model_provider(model_provider_selection)
    result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    if not result or (isinstance(result, str) and result.strip() == ""):
        return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
    return result


async def scrape_website(url, scraper_selection):
    """Scrape the given URL with the selected scraper and return the page content as markdown."""
    try:
        if scraper_selection == "Scrape with FireCrawl":
            return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
        elif scraper_selection == "Scrape with Crawl4AI":
            return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        else:
            return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"


# Gradio UI
with gr.Blocks() as gradio_ui:
    gr.HTML("""
        <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
            <h1 style="margin: 0;">LLM Web Scraper</h1>
            <div style="display: flex; gap: 10px;">
                <a href="https://github.com/langchain-ai/langchain" target="_blank">
                    <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
                </a>
                <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                    <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
                </a>
                <a href="https://build.nvidia.com/models" target="_blank">
                    <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
                </a>
                <a href="https://firecrawl.dev/" target="_blank">
                    <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
                </a>
                <a href="https://docs.crawl4ai.com/" target="_blank">
                    <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
                </a>
            </div>
        </div>
    """)
gr.HTML("""
<div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
<span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
<a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
<img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
</a>
</div>
""")
gr.HTML("""
<div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
<h2 style="margin-top: 0;">How to Use This App</h2>
<p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
<ol>
<li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
<li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
<li><strong>Scrape the Web Page:</strong> Click the "Scrape with FireCrawl" button to extract the content of the page.</li>
<li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
<li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
</ol>
<br />
<br />
<p><strong>What makes this different from a regular web scraper?</strong> </p>
<p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
</div>
""")
    with gr.Column():
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
        # search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)

        with gr.Row():
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
                value="Scrape with FireCrawl"
            )
            scrape_btn = gr.Button("Scrape Website")

        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        with gr.Row():
            # Add a single dropdown for model and provider selection
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )
            llm_response_btn = gr.Button("Extract Info by LLM")
            cancel_btn = gr.Button("Cancel", variant="stop")
        # LLM response output area and loader
        llm_response = gr.Markdown(
            "\n" * 9,  # 9 newlines + 1 line for empty content = 10 lines minimum
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )
        # Removed custom loader; Gradio will show a spinner automatically during processing.
    scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
    cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])

gradio_ui.launch(server_name="0.0.0.0")
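
# Note: launch() binds to all interfaces here (typically needed inside a container or hosted Space);
# the port falls back to Gradio's default (7860) unless server_port or the GRADIO_SERVER_PORT
# environment variable overrides it.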