Updated the main UI and system prompt so that the LLM can provide data for non-product-based queries as well.
Files changed:
- app.py +21 -4
- llm_inference_service.py +3 -0
- requirements.txt +6 -20
app.py
CHANGED

@@ -53,6 +53,21 @@ with gr.Blocks() as gradio_ui:
     </div>
     """)
 
+    gr.HTML("""
+    <div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
+        <h2 style="margin-top: 0;">How to Use This App</h2>
+        <p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
+        <ol>
+            <li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
+            <li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
+            <li><strong>Scrape the Web Page:</strong> Click the "Scrape with FireCrawl" button to extract the content of the page.</li>
+            <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
+            <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
+        </ol>
+        <p><strong>What makes this different from a regular web scraper?</strong> Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
+    </div>
+    """)
+
 
     with gr.Column():
         url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)

@@ -62,9 +77,11 @@ with gr.Blocks() as gradio_ui:
 
         scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
 
-
-        gr.
-
+        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
+        gr.Markdown("### 🧠 LLM Extraction")
+        gr.Markdown("Use a language model to extract structured information from the scraped content.")
+        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
+
 
         with gr.Row():
 

@@ -83,7 +100,7 @@ with gr.Blocks() as gradio_ui:
         )
 
 
-        llm_response_btn = gr.Button("
+        llm_response_btn = gr.Button("Extract Info by LLM")
 
 
        # LLM response output area and loader
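For orientation, here is a minimal sketch of how the new "Extract Info by LLM" button could be wired up. Only the button label, the "Scrape Result" textbox, and the extract_page_info_by_llm signature come from the diff above; the remaining component names, the model choices, and the stub handler are assumptions for illustration.

import gradio as gr

# Stand-in for llm_inference_service.extract_page_info_by_llm; the real function
# sends the query, the scraped markdown, and the chosen model to an LLM provider.
def extract_page_info_by_llm(user_query, scraped_markdown_content, model):
    return f"[{model}] would answer: {user_query}"

with gr.Blocks() as gradio_ui:
    with gr.Column():
        url_input = gr.Textbox(label="Enter URL to scrape", lines=1)
        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
        user_query_input = gr.Textbox(label="What do you want to extract?")  # assumed name and label
        model_dropdown = gr.Dropdown(["gemini-2.0-flash"], label="Model")     # assumed choices
        llm_response_btn = gr.Button("Extract Info by LLM")
        llm_response_output = gr.Textbox(label="LLM Response", lines=10)      # assumed name and label

        # Wire the new button to the extraction call.
        llm_response_btn.click(
            fn=extract_page_info_by_llm,
            inputs=[user_query_input, scrape_result_textbox, model_dropdown],
            outputs=llm_response_output,
        )

gradio_ui.launch()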
llm_inference_service.py
CHANGED

@@ -19,6 +19,9 @@ def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, mod
 
     If user asks for JSON format, please provide the answer in JSON format only.
 
+    User will mostly request you to extract product information but can also ask you to extract other information from the content.
+    So always read the user query carefully and extract information accordingly.
+
     If you do not find or know the answer, do not hallucinate, do not try to generate fake answers.
     If no Context is given, simply state "No relevant information found to answer your question."
 
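The expanded prompt wording above is meant to steer the model beyond product-only extraction. Below is a rough sketch of how it might be combined with the scraped markdown and sent through one of the pinned providers (langchain-google-genai here; the NVIDIA endpoint would look analogous). The prompt text is taken from the diff, but the message layout, model handling, and function body are assumptions rather than the repository's actual implementation.

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI

SYSTEM_PROMPT = """
If user asks for JSON format, please provide the answer in JSON format only.

User will mostly request you to extract product information but can also ask you to extract other information from the content.
So always read the user query carefully and extract information accordingly.

If you do not find or know the answer, do not hallucinate, do not try to generate fake answers.
If no Context is given, simply state "No relevant information found to answer your question."
"""

def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model: str) -> str:
    # Requires GOOGLE_API_KEY in the environment; model is e.g. "gemini-2.0-flash" (assumed).
    llm = ChatGoogleGenerativeAI(model=model)
    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=f"Context:\n{scraped_markdown_content}\n\nQuery: {user_query}"),
    ]
    return llm.invoke(messages).content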
requirements.txt
CHANGED

@@ -1,21 +1,7 @@
-
-
-
-
-# PyMuPDF==1.26.4
-# langchain-google-genai==2.1.12
-# langchain-nvidia-ai-endpoints==0.3.18
-# dotenv==0.9.9
-
-
-
-gradio
-requests
-# python-dotenv
-dotenv
-
-firecrawl-py
+gradio==5.46.1
+requests==2.32.5
+dotenv==0.9.9
+firecrawl-py==4.3.6
 langchain-community
-langchain-google-genai
-langchain-nvidia-ai-endpoints
-# groq
+langchain-google-genai==2.1.12
+langchain-nvidia-ai-endpoints==0.3.18