frkhan commited on
Commit
255e074
·
1 Parent(s): 8c93d26

- Implemented scraping using Crawl4AI.

Browse files

- Added error and exception handling.
- Using the mcr.microsoft.com/playwright/python:v1.54.0-noble Playwright Docker image to build and install the packages needed to run Playwright with Crawl4AI.
-- Added asyncio, crawl4ai, playwright dependencies.

.dockerignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+
3
+ *.log
4
+ *.db
Dockerfile CHANGED
@@ -1,21 +1,19 @@
1
- # Use official Python base image
2
- FROM python:3.10-slim-bookworm
3
 
4
- # Set working directory
5
  WORKDIR /app
6
 
7
- # Upgrade system packages to patch vulnerabilities
8
  RUN apt-get update && apt-get upgrade -y && apt-get clean
9
 
10
- # Copy requirements and install dependencies
11
  COPY requirements.txt .
12
- RUN pip install --no-cache-dir -r requirements.txt
 
 
 
13
 
14
- # Copy app code
15
  COPY . .
16
 
17
- # Expose Gradio default port
18
  EXPOSE 7860
19
 
20
- # Run the app
21
  CMD ["python", "app.py"]
 
 
 
1
+ FROM mcr.microsoft.com/playwright/python:v1.54.0-noble
 
2
 
 
3
  WORKDIR /app
4
 
 
5
  RUN apt-get update && apt-get upgrade -y && apt-get clean
6
 
 
7
  COPY requirements.txt .
8
+ RUN pip install --break-system-packages -r requirements.txt
9
+ RUN python -m playwright install --with-deps chromium
10
+
11
+ # RUN pip install watchfiles
12
 
 
13
  COPY . .
14
 
 
15
  EXPOSE 7860
16
 
 
17
  CMD ["python", "app.py"]
18
+
19
+
Dockerfile.dev ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM mcr.microsoft.com/playwright/python:v1.54.0-noble
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get upgrade -y && apt-get clean
6
+
7
+ COPY requirements.txt .
8
+ RUN pip install --break-system-packages -r requirements.txt
9
+ RUN python -m playwright install --with-deps chromium
10
+
11
+ # RUN pip install watchfiles
12
+
13
+ COPY . .
14
+
15
+ EXPOSE 7860
16
+
17
+ CMD ["python", "app.py"]
18
+
19
+
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
 
3
  import firecrawl_client
 
4
  import llm_inference_service
5
 
6
  def parse_model_provider(selection):
@@ -18,6 +19,17 @@ def llm_response_wrapper(query, scrape_result, model_provider_selection):
18
  return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
19
  return result
20
 
 
 
 
 
 
 
 
 
 
 
 
21
  #Gradio UI
22
  with gr.Blocks() as gradio_ui:
23
  gr.HTML("""
@@ -36,7 +48,7 @@ with gr.Blocks() as gradio_ui:
36
  <a href="https://firecrawl.dev/" target="_blank">
37
  <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
38
  </a>
39
- <a href="https://github.com/crawl4ai/crawl4ai" target="_blank">
40
  <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
41
  </a>
42
 
@@ -64,7 +76,13 @@ with gr.Blocks() as gradio_ui:
64
  <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
65
  <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
66
  </ol>
67
- <p><strong>What makes this different from a regular web scraper?</strong> Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
 
 
 
 
 
 
68
  </div>
69
  """)
70
 
@@ -73,8 +91,14 @@ with gr.Blocks() as gradio_ui:
73
  url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
74
  # search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
75
  query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
76
- scrape_btn = gr.Button("Scrape with FireCrawl")
77
 
 
 
 
 
 
 
 
78
  scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
79
 
80
  gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
@@ -101,6 +125,7 @@ with gr.Blocks() as gradio_ui:
101
 
102
 
103
  llm_response_btn = gr.Button("Extract Info by LLM")
 
104
 
105
 
106
  # LLM response output area and loader
@@ -113,12 +138,14 @@ with gr.Blocks() as gradio_ui:
113
  # Removed custom loader; Gradio will show a spinner automatically during processing.
114
 
115
 
116
- scrape_btn.click(fn=firecrawl_client.scrape_and_get_markdown_with_firecrawl, inputs=url_input, outputs=scrape_result_textbox)
117
 
118
- llm_response_btn.click(
119
  fn=llm_response_wrapper,
120
  inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
121
  outputs=llm_response
122
  )
 
 
123
 
124
  gradio_ui.launch(server_name="0.0.0.0")
 
1
  import gradio as gr
2
 
3
  import firecrawl_client
4
+ import crawl4ai_client
5
  import llm_inference_service
6
 
7
  def parse_model_provider(selection):
 
19
  return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
20
  return result
21
 
22
+ async def scrape_website(url, scraper_selection):
23
+ try:
24
+ if scraper_selection == "Scrape with FireCrawl":
25
+ return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
26
+ elif scraper_selection == "Scrape with Crawl4AI":
27
+ return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
28
+ else:
29
+ return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
30
+ except Exception as e:
31
+ return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
32
+
33
  #Gradio UI
34
  with gr.Blocks() as gradio_ui:
35
  gr.HTML("""
 
48
  <a href="https://firecrawl.dev/" target="_blank">
49
  <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
50
  </a>
51
+ <a href="https://docs.crawl4ai.com/" target="_blank">
52
  <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
53
  </a>
54
 
 
76
  <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
77
  <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
78
  </ol>
79
+
80
+ <br />
81
+ <br />
82
+
83
+ <p><strong>What makes this different from a regular web scraper?</strong> </p>
84
+
85
+ <p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
86
  </div>
87
  """)
88
 
 
91
  url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
92
  # search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
93
  query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
 
94
 
95
+ with gr.Row():
96
+ scraper_dropdown = gr.Dropdown(
97
+ label="Select Scraper",
98
+ choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
99
+ value="Scrape with FireCrawl"
100
+ )
101
+ scrape_btn = gr.Button("Scrape Website")
102
  scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
103
 
104
  gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
 
125
 
126
 
127
  llm_response_btn = gr.Button("Extract Info by LLM")
128
+ cancel_btn = gr.Button("Cancel", variant="stop")
129
 
130
 
131
  # LLM response output area and loader
 
138
  # Removed custom loader; Gradio will show a spinner automatically during processing.
139
 
140
 
141
+ scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
142
 
143
+ llm_event = llm_response_btn.click(
144
  fn=llm_response_wrapper,
145
  inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
146
  outputs=llm_response
147
  )
148
+
149
+ cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
150
 
151
  gradio_ui.launch(server_name="0.0.0.0")
crawl4ai_client.py CHANGED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from crawl4ai import AsyncWebCrawler
3
+
4
+
5
async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
    """Fetch *url* with Crawl4AI and return its content as markdown.

    Returns the scraped markdown on success. If the crawl yields no
    content, or any exception is raised, an HTML-formatted error string
    (for direct display in the UI) is returned instead.
    """
    try:
        async with AsyncWebCrawler() as crawler:
            crawl_result = await crawler.arun(url=url)
            # Guard: a missing result or empty markdown means nothing usable came back.
            if not (crawl_result and crawl_result.markdown):
                return "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
            return crawl_result.markdown
    except Exception as e:
        return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
docker-compose.dev.yml CHANGED
@@ -4,7 +4,7 @@ services:
4
  semantic-search-app:
5
  build:
6
  context: .
7
- dockerfile: Dockerfile
8
  container_name: llm-web-scrapper
9
  ports:
10
  - "12200:7860"
 
4
  semantic-search-app:
5
  build:
6
  context: .
7
+ dockerfile: Dockerfile.dev # Use the development Dockerfile for local development
8
  container_name: llm-web-scrapper
9
  ports:
10
  - "12200:7860"
firecrawl_client.py CHANGED
@@ -25,9 +25,14 @@ def get_markdown_from_documents(docs: list[Document]) -> str:
25
 
26
 
27
  def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
28
- docs = scrape_with_firecrawl(url)
29
- markdown = get_markdown_from_documents(docs)
30
- return markdown
 
 
 
 
 
31
 
32
 
33
 
 
25
 
26
 
27
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
    """Scrape *url* via FireCrawl and return the combined markdown.

    Returns the markdown extracted from the scraped documents. If the
    scrape yields no documents, or any exception is raised, an
    HTML-formatted error string (for direct display in the UI) is
    returned instead.
    """
    try:
        documents = scrape_with_firecrawl(url)
        # Success path: convert the returned documents to one markdown string.
        if documents:
            return get_markdown_from_documents(documents)
        return "❌ <span style='color:red;'>FireCrawl completed but returned no content. The page might be empty or inaccessible.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An error occurred while scraping with FireCrawl: {e}</span>"
36
 
37
 
38
 
requirements.txt CHANGED
@@ -4,4 +4,6 @@ dotenv==0.9.9
4
  firecrawl-py==4.3.6
5
  langchain-community
6
  langchain-google-genai==2.1.12
7
- langchain-nvidia-ai-endpoints==0.3.18
 
 
 
4
  firecrawl-py==4.3.6
5
  langchain-community
6
  langchain-google-genai==2.1.12
7
+ langchain-nvidia-ai-endpoints==0.3.18
8
+ # NOTE: do not add "asyncio" here — it is part of the Python 3 standard library; the PyPI "asyncio" package is an obsolete 3.4-era shim that can shadow the stdlib module
9
+ crawl4ai