Spaces:
Running
Running
- Implemented scraping using Crawl4AI.
- Added error and exception handling.
- Using the mcr.microsoft.com/playwright/python:v1.54.0-noble Playwright Docker image to build and install the packages needed to run Playwright with Crawl4AI.
-- Added asyncio, crawl4ai, playwright dependencies.
- .dockerignore +4 -0
- Dockerfile +7 -9
- Dockerfile.dev +19 -0
- app.py +32 -5
- crawl4ai_client.py +14 -0
- docker-compose.dev.yml +1 -1
- firecrawl_client.py +8 -3
- requirements.txt +3 -1
.dockerignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
|
| 3 |
+
*.log
|
| 4 |
+
*.db
|
Dockerfile
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
-
|
| 2 |
-
FROM python:3.10-slim-bookworm
|
| 3 |
|
| 4 |
-
# Set working directory
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
# Upgrade system packages to patch vulnerabilities
|
| 8 |
RUN apt-get update && apt-get upgrade -y && apt-get clean
|
| 9 |
|
| 10 |
-
# Copy requirements and install dependencies
|
| 11 |
COPY requirements.txt .
|
| 12 |
-
RUN pip install --
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
# Copy app code
|
| 15 |
COPY . .
|
| 16 |
|
| 17 |
-
# Expose Gradio default port
|
| 18 |
EXPOSE 7860
|
| 19 |
|
| 20 |
-
# Run the app
|
| 21 |
CMD ["python", "app.py"]
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM mcr.microsoft.com/playwright/python:v1.54.0-noble
|
|
|
|
| 2 |
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
RUN apt-get update && apt-get upgrade -y && apt-get clean
|
| 6 |
|
|
|
|
| 7 |
COPY requirements.txt .
|
| 8 |
+
RUN pip install --break-system-packages -r requirements.txt
|
| 9 |
+
RUN python -m playwright install --with-deps chromium
|
| 10 |
+
|
| 11 |
+
# RUN pip install watchfiles
|
| 12 |
|
|
|
|
| 13 |
COPY . .
|
| 14 |
|
|
|
|
| 15 |
EXPOSE 7860
|
| 16 |
|
|
|
|
| 17 |
CMD ["python", "app.py"]
|
| 18 |
+
|
| 19 |
+
|
Dockerfile.dev
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM mcr.microsoft.com/playwright/python:v1.54.0-noble
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get upgrade -y && apt-get clean
|
| 6 |
+
|
| 7 |
+
COPY requirements.txt .
|
| 8 |
+
RUN pip install --break-system-packages -r requirements.txt
|
| 9 |
+
RUN python -m playwright install --with-deps chromium
|
| 10 |
+
|
| 11 |
+
# RUN pip install watchfiles
|
| 12 |
+
|
| 13 |
+
COPY . .
|
| 14 |
+
|
| 15 |
+
EXPOSE 7860
|
| 16 |
+
|
| 17 |
+
CMD ["python", "app.py"]
|
| 18 |
+
|
| 19 |
+
|
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
import firecrawl_client
|
|
|
|
| 4 |
import llm_inference_service
|
| 5 |
|
| 6 |
def parse_model_provider(selection):
|
|
@@ -18,6 +19,17 @@ def llm_response_wrapper(query, scrape_result, model_provider_selection):
|
|
| 18 |
return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
|
| 19 |
return result
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
#Gradio UI
|
| 22 |
with gr.Blocks() as gradio_ui:
|
| 23 |
gr.HTML("""
|
|
@@ -36,7 +48,7 @@ with gr.Blocks() as gradio_ui:
|
|
| 36 |
<a href="https://firecrawl.dev/" target="_blank">
|
| 37 |
<img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
|
| 38 |
</a>
|
| 39 |
-
<a href="https://
|
| 40 |
<img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
|
| 41 |
</a>
|
| 42 |
|
|
@@ -64,7 +76,13 @@ with gr.Blocks() as gradio_ui:
|
|
| 64 |
<li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
|
| 65 |
<li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
|
| 66 |
</ol>
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
</div>
|
| 69 |
""")
|
| 70 |
|
|
@@ -73,8 +91,14 @@ with gr.Blocks() as gradio_ui:
|
|
| 73 |
url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
|
| 74 |
# search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
|
| 75 |
query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
|
| 76 |
-
scrape_btn = gr.Button("Scrape with FireCrawl")
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 79 |
|
| 80 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
|
@@ -101,6 +125,7 @@ with gr.Blocks() as gradio_ui:
|
|
| 101 |
|
| 102 |
|
| 103 |
llm_response_btn = gr.Button("Extract Info by LLM")
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
# LLM response output area and loader
|
|
@@ -113,12 +138,14 @@ with gr.Blocks() as gradio_ui:
|
|
| 113 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 114 |
|
| 115 |
|
| 116 |
-
scrape_btn.click(fn=
|
| 117 |
|
| 118 |
-
llm_response_btn.click(
|
| 119 |
fn=llm_response_wrapper,
|
| 120 |
inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
|
| 121 |
outputs=llm_response
|
| 122 |
)
|
|
|
|
|
|
|
| 123 |
|
| 124 |
gradio_ui.launch(server_name="0.0.0.0")
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
import firecrawl_client
|
| 4 |
+
import crawl4ai_client
|
| 5 |
import llm_inference_service
|
| 6 |
|
| 7 |
def parse_model_provider(selection):
|
|
|
|
| 19 |
return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
|
| 20 |
return result
|
| 21 |
|
| 22 |
+
async def scrape_website(url, scraper_selection):
    """Scrape *url* using the scraper chosen in the UI dropdown.

    Args:
        url: Page URL to scrape.
        scraper_selection: Dropdown label, either "Scrape with FireCrawl"
            or "Scrape with Crawl4AI".

    Returns:
        Markdown text of the scraped page on success, otherwise an
        HTML-formatted error message. Never raises, so the Gradio
        textbox always receives a displayable string.
    """
    try:
        if scraper_selection == "Scrape with FireCrawl":
            # The FireCrawl client is synchronous; calling it directly here
            # would block the event loop (freezing the UI and making the
            # Cancel button unresponsive). Run it in a worker thread instead.
            import asyncio
            return await asyncio.to_thread(
                firecrawl_client.scrape_and_get_markdown_with_firecrawl, url
            )
        elif scraper_selection == "Scrape with Crawl4AI":
            return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        else:
            return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
|
| 32 |
+
|
| 33 |
#Gradio UI
|
| 34 |
with gr.Blocks() as gradio_ui:
|
| 35 |
gr.HTML("""
|
|
|
|
| 48 |
<a href="https://firecrawl.dev/" target="_blank">
|
| 49 |
<img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
|
| 50 |
</a>
|
| 51 |
+
<a href="https://docs.crawl4ai.com/" target="_blank">
|
| 52 |
<img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
|
| 53 |
</a>
|
| 54 |
|
|
|
|
| 76 |
<li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
|
| 77 |
<li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
|
| 78 |
</ol>
|
| 79 |
+
|
| 80 |
+
<br />
|
| 81 |
+
<br />
|
| 82 |
+
|
| 83 |
+
<p><strong>What makes this different from a regular web scraper?</strong> </p>
|
| 84 |
+
|
| 85 |
+
<p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
|
| 86 |
</div>
|
| 87 |
""")
|
| 88 |
|
|
|
|
| 91 |
url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
|
| 92 |
# search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
|
| 93 |
query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
|
|
|
|
| 94 |
|
| 95 |
+
with gr.Row():
|
| 96 |
+
scraper_dropdown = gr.Dropdown(
|
| 97 |
+
label="Select Scraper",
|
| 98 |
+
choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
|
| 99 |
+
value="Scrape with FireCrawl"
|
| 100 |
+
)
|
| 101 |
+
scrape_btn = gr.Button("Scrape Website")
|
| 102 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 103 |
|
| 104 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
|
|
|
| 125 |
|
| 126 |
|
| 127 |
llm_response_btn = gr.Button("Extract Info by LLM")
|
| 128 |
+
cancel_btn = gr.Button("Cancel", variant="stop")
|
| 129 |
|
| 130 |
|
| 131 |
# LLM response output area and loader
|
|
|
|
| 138 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 139 |
|
| 140 |
|
| 141 |
+
scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
|
| 142 |
|
| 143 |
+
llm_event = llm_response_btn.click(
|
| 144 |
fn=llm_response_wrapper,
|
| 145 |
inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
|
| 146 |
outputs=llm_response
|
| 147 |
)
|
| 148 |
+
|
| 149 |
+
cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
|
| 150 |
|
| 151 |
gradio_ui.launch(server_name="0.0.0.0")
|
crawl4ai_client.py
CHANGED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from crawl4ai import AsyncWebCrawler


async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
    """Scrape *url* with Crawl4AI and return the page content as markdown.

    Args:
        url: Page URL to scrape.

    Returns:
        The page's markdown on success, otherwise an HTML-formatted error
        string. Never raises, so callers can display the result directly.
    """
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)
            if result and result.markdown:
                return result.markdown
            # If result is None or markdown is empty
            return "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
|
docker-compose.dev.yml
CHANGED
|
@@ -4,7 +4,7 @@ services:
|
|
| 4 |
semantic-search-app:
|
| 5 |
build:
|
| 6 |
context: .
|
| 7 |
-
dockerfile: Dockerfile
|
| 8 |
container_name: llm-web-scrapper
|
| 9 |
ports:
|
| 10 |
- "12200:7860"
|
|
|
|
| 4 |
semantic-search-app:
|
| 5 |
build:
|
| 6 |
context: .
|
| 7 |
+
dockerfile: Dockerfile.dev # Use the development Dockerfile for local development
|
| 8 |
container_name: llm-web-scrapper
|
| 9 |
ports:
|
| 10 |
- "12200:7860"
|
firecrawl_client.py
CHANGED
|
@@ -25,9 +25,14 @@ def get_markdown_from_documents(docs: list[Document]) -> str:
|
|
| 25 |
|
| 26 |
|
| 27 |
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
    """Scrape *url* via FireCrawl and return the result as markdown.

    Args:
        url: Page URL to scrape.

    Returns:
        Markdown built from the scraped documents, or an HTML-formatted
        error string when scraping yields nothing or raises. Never raises,
        so the UI can show the returned text as-is.
    """
    try:
        documents = scrape_with_firecrawl(url)
        if documents:
            return get_markdown_from_documents(documents)
        # Empty/None result: report it rather than returning blank output.
        return "❌ <span style='color:red;'>FireCrawl completed but returned no content. The page might be empty or inaccessible.</span>"
    except Exception as exc:
        return f"❌ <span style='color:red;'>An error occurred while scraping with FireCrawl: {exc}</span>"
|
| 36 |
|
| 37 |
|
| 38 |
|
requirements.txt
CHANGED
|
@@ -4,4 +4,6 @@ dotenv==0.9.9
|
|
| 4 |
firecrawl-py==4.3.6
|
| 5 |
langchain-community
|
| 6 |
langchain-google-genai==2.1.12
|
| 7 |
-
langchain-nvidia-ai-endpoints==0.3.18
|
|
|
|
|
|
|
|
|
| 4 |
firecrawl-py==4.3.6
|
| 5 |
langchain-community
|
| 6 |
langchain-google-genai==2.1.12
|
| 7 |
+
langchain-nvidia-ai-endpoints==0.3.18
|
| 8 |
+
asyncio
|
| 9 |
+
crawl4ai
|