Spaces:
Running
Running
- Implemented scraping using Crawl4AI.
- Added error and exception handling.
- Using the mcr.microsoft.com/playwright/python:v1.54.0-noble Playwright Docker image to build and install the packages needed to run Playwright with Crawl4AI.
-- Added asyncio, crawl4ai, playwright dependencies.
- .dockerignore +4 -0
- Dockerfile +7 -9
- Dockerfile.dev +19 -0
- app.py +32 -5
- crawl4ai_client.py +14 -0
- docker-compose.dev.yml +1 -1
- firecrawl_client.py +8 -3
- requirements.txt +3 -1
.dockerignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
|
| 3 |
+
*.log
|
| 4 |
+
*.db
|
Dockerfile
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
-
|
| 2 |
-
FROM python:3.10-slim-bookworm
|
| 3 |
|
| 4 |
-
# Set working directory
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
# Upgrade system packages to patch vulnerabilities
|
| 8 |
RUN apt-get update && apt-get upgrade -y && apt-get clean
|
| 9 |
|
| 10 |
-
# Copy requirements and install dependencies
|
| 11 |
COPY requirements.txt .
|
| 12 |
-
RUN pip install --
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
# Copy app code
|
| 15 |
COPY . .
|
| 16 |
|
| 17 |
-
# Expose Gradio default port
|
| 18 |
EXPOSE 7860
|
| 19 |
|
| 20 |
-
# Run the app
|
| 21 |
CMD ["python", "app.py"]
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM mcr.microsoft.com/playwright/python:v1.54.0-noble
|
|
|
|
| 2 |
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
RUN apt-get update && apt-get upgrade -y && apt-get clean
|
| 6 |
|
|
|
|
| 7 |
COPY requirements.txt .
|
| 8 |
+
RUN pip install --break-system-packages -r requirements.txt
|
| 9 |
+
RUN python -m playwright install --with-deps chromium
|
| 10 |
+
|
| 11 |
+
# RUN pip install watchfiles
|
| 12 |
|
|
|
|
| 13 |
COPY . .
|
| 14 |
|
|
|
|
| 15 |
EXPOSE 7860
|
| 16 |
|
|
|
|
| 17 |
CMD ["python", "app.py"]
|
| 18 |
+
|
| 19 |
+
|
Dockerfile.dev
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM mcr.microsoft.com/playwright/python:v1.54.0-noble
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get upgrade -y && apt-get clean
|
| 6 |
+
|
| 7 |
+
COPY requirements.txt .
|
| 8 |
+
RUN pip install --break-system-packages -r requirements.txt
|
| 9 |
+
RUN python -m playwright install --with-deps chromium
|
| 10 |
+
|
| 11 |
+
# RUN pip install watchfiles
|
| 12 |
+
|
| 13 |
+
COPY . .
|
| 14 |
+
|
| 15 |
+
EXPOSE 7860
|
| 16 |
+
|
| 17 |
+
CMD ["python", "app.py"]
|
| 18 |
+
|
| 19 |
+
|
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
import firecrawl_client
|
|
|
|
| 4 |
import llm_inference_service
|
| 5 |
|
| 6 |
def parse_model_provider(selection):
|
|
@@ -18,6 +19,17 @@ def llm_response_wrapper(query, scrape_result, model_provider_selection):
|
|
| 18 |
return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
|
| 19 |
return result
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
#Gradio UI
|
| 22 |
with gr.Blocks() as gradio_ui:
|
| 23 |
gr.HTML("""
|
|
@@ -36,7 +48,7 @@ with gr.Blocks() as gradio_ui:
|
|
| 36 |
<a href="https://firecrawl.dev/" target="_blank">
|
| 37 |
<img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
|
| 38 |
</a>
|
| 39 |
-
<a href="https://
|
| 40 |
<img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
|
| 41 |
</a>
|
| 42 |
|
|
@@ -64,7 +76,13 @@ with gr.Blocks() as gradio_ui:
|
|
| 64 |
<li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
|
| 65 |
<li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
|
| 66 |
</ol>
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
</div>
|
| 69 |
""")
|
| 70 |
|
|
@@ -73,8 +91,14 @@ with gr.Blocks() as gradio_ui:
|
|
| 73 |
url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
|
| 74 |
# search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
|
| 75 |
query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
|
| 76 |
-
scrape_btn = gr.Button("Scrape with FireCrawl")
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 79 |
|
| 80 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
|
@@ -101,6 +125,7 @@ with gr.Blocks() as gradio_ui:
|
|
| 101 |
|
| 102 |
|
| 103 |
llm_response_btn = gr.Button("Extract Info by LLM")
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
# LLM response output area and loader
|
|
@@ -113,12 +138,14 @@ with gr.Blocks() as gradio_ui:
|
|
| 113 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 114 |
|
| 115 |
|
| 116 |
-
scrape_btn.click(fn=
|
| 117 |
|
| 118 |
-
llm_response_btn.click(
|
| 119 |
fn=llm_response_wrapper,
|
| 120 |
inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
|
| 121 |
outputs=llm_response
|
| 122 |
)
|
|
|
|
|
|
|
| 123 |
|
| 124 |
gradio_ui.launch(server_name="0.0.0.0")
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
import firecrawl_client
|
| 4 |
+
import crawl4ai_client
|
| 5 |
import llm_inference_service
|
| 6 |
|
| 7 |
def parse_model_provider(selection):
|
|
|
|
| 19 |
return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
|
| 20 |
return result
|
| 21 |
|
| 22 |
+
async def scrape_website(url, scraper_selection):
    """Scrape *url* using the scraper chosen in the UI dropdown.

    Args:
        url: Page URL to scrape.
        scraper_selection: Dropdown label, either "Scrape with FireCrawl"
            or "Scrape with Crawl4AI".

    Returns:
        Markdown text of the scraped page on success, otherwise an
        HTML-formatted error message. Never raises, so the Gradio
        textbox always receives a displayable string.
    """
    try:
        if scraper_selection == "Scrape with FireCrawl":
            # The FireCrawl client is synchronous; calling it directly here
            # would block the event loop (freezing the UI and making the
            # Cancel button unresponsive). Run it in a worker thread instead.
            import asyncio
            return await asyncio.to_thread(
                firecrawl_client.scrape_and_get_markdown_with_firecrawl, url
            )
        elif scraper_selection == "Scrape with Crawl4AI":
            return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        else:
            return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
|
| 32 |
+
|
| 33 |
#Gradio UI
|
| 34 |
with gr.Blocks() as gradio_ui:
|
| 35 |
gr.HTML("""
|
|
|
|
| 48 |
<a href="https://firecrawl.dev/" target="_blank">
|
| 49 |
<img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
|
| 50 |
</a>
|
| 51 |
+
<a href="https://docs.crawl4ai.com/" target="_blank">
|
| 52 |
<img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
|
| 53 |
</a>
|
| 54 |
|
|
|
|
| 76 |
<li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
|
| 77 |
<li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
|
| 78 |
</ol>
|
| 79 |
+
|
| 80 |
+
<br />
|
| 81 |
+
<br />
|
| 82 |
+
|
| 83 |
+
<p><strong>What makes this different from a regular web scraper?</strong> </p>
|
| 84 |
+
|
| 85 |
+
<p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
|
| 86 |
</div>
|
| 87 |
""")
|
| 88 |
|
|
|
|
| 91 |
url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
|
| 92 |
# search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
|
| 93 |
query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
|
|
|
|
| 94 |
|
| 95 |
+
with gr.Row():
|
| 96 |
+
scraper_dropdown = gr.Dropdown(
|
| 97 |
+
label="Select Scraper",
|
| 98 |
+
choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
|
| 99 |
+
value="Scrape with FireCrawl"
|
| 100 |
+
)
|
| 101 |
+
scrape_btn = gr.Button("Scrape Website")
|
| 102 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 103 |
|
| 104 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
|
|
|
| 125 |
|
| 126 |
|
| 127 |
llm_response_btn = gr.Button("Extract Info by LLM")
|
| 128 |
+
cancel_btn = gr.Button("Cancel", variant="stop")
|
| 129 |
|
| 130 |
|
| 131 |
# LLM response output area and loader
|
|
|
|
| 138 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 139 |
|
| 140 |
|
| 141 |
+
scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
|
| 142 |
|
| 143 |
+
llm_event = llm_response_btn.click(
|
| 144 |
fn=llm_response_wrapper,
|
| 145 |
inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
|
| 146 |
outputs=llm_response
|
| 147 |
)
|
| 148 |
+
|
| 149 |
+
cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
|
| 150 |
|
| 151 |
gradio_ui.launch(server_name="0.0.0.0")
|
crawl4ai_client.py
CHANGED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from crawl4ai import AsyncWebCrawler


async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
    """Scrape *url* with Crawl4AI and return the page content as markdown.

    Args:
        url: Page URL to scrape.

    Returns:
        The page's markdown on success, otherwise an HTML-formatted error
        string. Never raises, so callers can display the result directly.
    """
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)
            if result and result.markdown:
                return result.markdown
            # If result is None or markdown is empty
            return "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
|
docker-compose.dev.yml
CHANGED
|
@@ -4,7 +4,7 @@ services:
|
|
| 4 |
semantic-search-app:
|
| 5 |
build:
|
| 6 |
context: .
|
| 7 |
-
dockerfile: Dockerfile
|
| 8 |
container_name: llm-web-scrapper
|
| 9 |
ports:
|
| 10 |
- "12200:7860"
|
|
|
|
| 4 |
semantic-search-app:
|
| 5 |
build:
|
| 6 |
context: .
|
| 7 |
+
dockerfile: Dockerfile.dev # Use the development Dockerfile for local development
|
| 8 |
container_name: llm-web-scrapper
|
| 9 |
ports:
|
| 10 |
- "12200:7860"
|
firecrawl_client.py
CHANGED
|
@@ -25,9 +25,14 @@ def get_markdown_from_documents(docs: list[Document]) -> str:
|
|
| 25 |
|
| 26 |
|
| 27 |
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
    """Scrape *url* via FireCrawl and return the result as markdown.

    Args:
        url: Page URL to scrape.

    Returns:
        Markdown built from the scraped documents, or an HTML-formatted
        error string when scraping yields nothing or raises. Never raises,
        so the UI can show the returned text as-is.
    """
    try:
        documents = scrape_with_firecrawl(url)
        if documents:
            return get_markdown_from_documents(documents)
        # Empty/None result: report it rather than returning blank output.
        return "❌ <span style='color:red;'>FireCrawl completed but returned no content. The page might be empty or inaccessible.</span>"
    except Exception as exc:
        return f"❌ <span style='color:red;'>An error occurred while scraping with FireCrawl: {exc}</span>"
|
| 36 |
|
| 37 |
|
| 38 |
|
requirements.txt
CHANGED
|
@@ -4,4 +4,6 @@ dotenv==0.9.9
|
|
| 4 |
firecrawl-py==4.3.6
|
| 5 |
langchain-community
|
| 6 |
langchain-google-genai==2.1.12
|
| 7 |
-
langchain-nvidia-ai-endpoints==0.3.18
|
|
|
|
|
|
|
|
|
| 4 |
firecrawl-py==4.3.6
|
| 5 |
langchain-community
|
| 6 |
langchain-google-genai==2.1.12
|
| 7 |
+
langchain-nvidia-ai-endpoints==0.3.18
|
| 8 |
+
asyncio
|
| 9 |
+
crawl4ai
|