Spaces:
Running
Running
File size: 7,426 Bytes
483c169 255e074 483c169 255e074 483c169 255e074 483c169 2e4cb4f 255e074 2e4cb4f 483c169 255e074 483c169 2e4cb4f 483c169 2e4cb4f 255e074 483c169 255e074 483c169 255e074 483c169 255e074 483c169 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import gradio as gr
import firecrawl_client
import crawl4ai_client
import llm_inference_service
def parse_model_provider(selection):
    """Split a dropdown label of the form ``"<model_name> (<provider>)"``.

    Args:
        selection: Label text, e.g. ``"gemini-2.5-pro (google_genai)"``.

    Returns:
        A ``(model, provider)`` tuple with surrounding whitespace stripped.

    Raises:
        ValueError: If ``selection`` does not end with a parenthesised
            provider introduced by ``" ("``, or if the model or provider
            part is empty.
    """
    text = selection.strip()
    # Split on the LAST " (" so a model name that itself contains " (" is
    # kept intact, and so a label without the separator is rejected with
    # ValueError instead of failing later with IndexError.
    model, separator, tail = text.rpartition(" (")
    if not separator or not tail.endswith(")"):
        raise ValueError(f"Invalid selection format: {selection}")

    model = model.strip()
    provider = tail[:-1].strip()  # drop only the closing parenthesis
    if not model or not provider:
        raise ValueError(f"Invalid selection format: {selection}")
    return model, provider
def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Run the LLM extraction step and return text for the response area.

    The combined dropdown label is split into model and provider via
    ``parse_model_provider``; the query, scraped content, model and provider
    are then handed to ``llm_inference_service.extract_page_info_by_llm``.

    Args:
        query: What the user wants to find in the page.
        scrape_result: Content previously produced by the scrape step.
        model_provider_selection: Label in ``"<model_name> (<provider>)"``
            form.

    Returns:
        The service result, or a red HTML error message when the result is
        falsy or a whitespace-only string.
    """
    model_name, provider_name = parse_model_provider(model_provider_selection)
    extracted = llm_inference_service.extract_page_info_by_llm(
        query, scrape_result, model_name, provider_name
    )

    # A string made only of whitespace is truthy, so it is checked separately.
    whitespace_only = isinstance(extracted, str) and not extracted.strip()
    if extracted and not whitespace_only:
        return extracted
    return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
async def scrape_website(url, scraper_selection):
    """Scrape ``url`` with the scraper named by ``scraper_selection``.

    Args:
        url: Address of the page to scrape.
        scraper_selection: Either ``"Scrape with FireCrawl"`` or
            ``"Scrape with Crawl4AI"``.

    Returns:
        Whatever the selected client function returns, or a red HTML error
        message when the selection is unknown or any exception is raised.
    """
    try:
        if scraper_selection == "Scrape with FireCrawl":
            # The FireCrawl client is called synchronously.
            outcome = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
        elif scraper_selection == "Scrape with Crawl4AI":
            # The Crawl4AI client is awaited.
            outcome = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        else:
            outcome = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
    except Exception as err:
        # Report the failure as text so it shows up in the result box.
        return f"❌ <span style='color:red;'>An unexpected error occurred: {err}</span>"
    return outcome
# Gradio UI
#
# Builds the single-page interface: a header with badge links, usage
# instructions, the scrape controls, and the LLM extraction controls, then
# wires the buttons to the handler functions and starts the server.
# NOTE(review): widget nesting below was reconstructed from statement order;
# confirm it matches the intended layout.
with gr.Blocks() as gradio_ui:
    # Title and technology badges.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
        <h1 style="margin: 0;"> LLM Web Scraper</h1>
        <div style="display: flex; gap: 10px;">
            <a href="https://github.com/langchain-ai/langchain" target="_blank">
                <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
            </a>
            <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
            </a>
            <a href="https://build.nvidia.com/models" target="_blank">
                <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
            </a>
            <a href="https://firecrawl.dev/" target="_blank">
                <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
            </a>
            <a href="https://docs.crawl4ai.com/" target="_blank">
                <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
            </a>
        </div>
    </div>
    """)

    # Link to the source repository.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
        <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
            <img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
        </a>
    </div>
    """)

    # Usage instructions. Step 3 names the actual "Scrape Website" button and
    # the scraper dropdown defined further down.
    gr.HTML("""
    <div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
        <h2 style="margin-top: 0;">How to Use This App</h2>
        <p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
        <ol>
            <li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
            <li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
            <li><strong>Scrape the Web Page:</strong> Choose a scraper (FireCrawl or Crawl4AI) and click the "Scrape Website" button to extract the content of the page.</li>
            <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
            <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
        </ol>
        <br />
        <br />
        <p><strong>What makes this different from a regular web scraper?</strong> </p>
        <p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
    </div>
    """)

    with gr.Column():
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)

        with gr.Row():
            # These choice strings are the exact values scrape_website compares against.
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
                value="Scrape with FireCrawl"
            )
            scrape_btn = gr.Button("Scrape Website")

        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")

        with gr.Row():
            # Single dropdown for model and provider; each label follows the
            # "<model_name> (<provider>)" format parsed by parse_model_provider.
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )
            llm_response_btn = gr.Button("Extract Info by LLM")
            cancel_btn = gr.Button("Cancel", variant="stop")

        # LLM response output area.
        llm_response = gr.Markdown(
            "\n" * 9,  # 9 newlines + 1 line for empty content = 10 lines minimum
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )

    # No custom loader; Gradio shows a spinner automatically during processing.
    scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
    # The Cancel button runs no function of its own; it only cancels the two events above.
    cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])

# Bind to all interfaces.
gradio_ui.launch(server_name="0.0.0.0")
|