import gradio as gr
import firecrawl_client
import crawl4ai_client
import llm_inference_service


def parse_model_provider(selection):
    # Expected format: "<model_name> (<provider>)"
    if "(" in selection and ")" in selection:
        model = selection.split(" (")[0].strip()
        provider = selection.split(" (")[1].replace(")", "").strip()
        return model, provider
    raise ValueError(f"Invalid selection format: {selection}")
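
# Worked example (illustrative values matching the dropdown choices defined below):
#   parse_model_provider("gemini-2.5-pro (google_genai)") -> ("gemini-2.5-pro", "google_genai")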


def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Extract the requested information from the scraped content using the selected model/provider."""
    model, provider = parse_model_provider(model_provider_selection)
    result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    if not result or (isinstance(result, str) and result.strip() == ""):
        return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
    return result


async def scrape_website(url, scraper_selection):
    """Scrape the given URL with the selected scraper and return the page content as markdown."""
    try:
        if scraper_selection == "Scrape with FireCrawl":
            return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
        elif scraper_selection == "Scrape with Crawl4AI":
            return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        else:
            return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"


# Gradio UI
with gr.Blocks() as gradio_ui:
    gr.HTML("""
        <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
            <h1 style="margin: 0;">LLM Web Scraper</h1>
            <div style="display: flex; gap: 10px;">
                <a href="https://github.com/langchain-ai/langchain" target="_blank">
                    <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
                </a>
                <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                    <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
                </a>
                <a href="https://build.nvidia.com/models" target="_blank">
                    <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
                </a>
                <a href="https://firecrawl.dev/" target="_blank">
                    <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
                </a>
                <a href="https://docs.crawl4ai.com/" target="_blank">
                    <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
                </a>
            </div>
        </div>
    """)
gr.HTML("""
<div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
<span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
<a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
<img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
</a>
</div>
""")
gr.HTML("""
<div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
<h2 style="margin-top: 0;">How to Use This App</h2>
<p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
<ol>
<li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
<li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
<li><strong>Scrape the Web Page:</strong> Click the "Scrape with FireCrawl" button to extract the content of the page.</li>
<li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
<li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
</ol>
<br />
<br />
<p><strong>What makes this different from a regular web scraper?</strong> </p>
<p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
</div>
""")
    with gr.Column():
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
        # search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)

        with gr.Row():
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
                value="Scrape with FireCrawl"
            )
            scrape_btn = gr.Button("Scrape Website")

        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        with gr.Row():
            # Add a single dropdown for model and provider selection
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )
            llm_response_btn = gr.Button("Extract Info by LLM")
            cancel_btn = gr.Button("Cancel", variant="stop")
        # LLM response output area and loader
        llm_response = gr.Markdown(
            "\n" * 9,  # 9 newlines + 1 line for empty content = 10 lines minimum
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )
        # Removed custom loader; Gradio will show a spinner automatically during processing.
    scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
    cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])

gradio_ui.launch(server_name="0.0.0.0")
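
# Note: launch() binds to all interfaces here (typically needed inside a container or hosted Space);
# the port falls back to Gradio's default (7860) unless server_port or the GRADIO_SERVER_PORT
# environment variable overrides it.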