Updated the main UI and system prompt so that the LLM can provide data for non-product-based queries as well.
Files changed:
- app.py +21 -4
- llm_inference_service.py +3 -0
- requirements.txt +6 -20
app.py
CHANGED

@@ -53,6 +53,21 @@ with gr.Blocks() as gradio_ui:
     </div>
     """)
 
+    gr.HTML("""
+    <div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
+        <h2 style="margin-top: 0;">How to Use This App</h2>
+        <p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
+        <ol>
+            <li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
+            <li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
+            <li><strong>Scrape the Web Page:</strong> Click the "Scrape with FireCrawl" button to extract the content of the page.</li>
+            <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
+            <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
+        </ol>
+        <p><strong>What makes this different from a regular web scraper?</strong> Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
+    </div>
+    """)
+
 
     with gr.Column():
         url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)

@@ -62,9 +77,11 @@ with gr.Blocks() as gradio_ui:
 
         scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
 
-
-        gr.
-
+        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
+        gr.Markdown("### 🧠 LLM Extraction")
+        gr.Markdown("Use a language model to extract structured information from the scraped content.")
+        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
+
 
         with gr.Row():
 

@@ -83,7 +100,7 @@ with gr.Blocks() as gradio_ui:
         )
 
 
-        llm_response_btn = gr.Button("
+        llm_response_btn = gr.Button("Extract Info by LLM")
 
 
        # LLM response output area and loader
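For orientation, here is a minimal sketch of how the new "Extract Info by LLM" button could be wired up. Only the button label, the "Scrape Result" textbox, and the extract_page_info_by_llm signature come from the diff above; the remaining component names, the model choices, and the stub handler are assumptions for illustration.

import gradio as gr

# Stand-in for llm_inference_service.extract_page_info_by_llm; the real function
# sends the query, the scraped markdown, and the chosen model to an LLM provider.
def extract_page_info_by_llm(user_query, scraped_markdown_content, model):
    return f"[{model}] would answer: {user_query}"

with gr.Blocks() as gradio_ui:
    with gr.Column():
        url_input = gr.Textbox(label="Enter URL to scrape", lines=1)
        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
        user_query_input = gr.Textbox(label="What do you want to extract?")  # assumed name and label
        model_dropdown = gr.Dropdown(["gemini-2.0-flash"], label="Model")     # assumed choices
        llm_response_btn = gr.Button("Extract Info by LLM")
        llm_response_output = gr.Textbox(label="LLM Response", lines=10)      # assumed name and label

        # Wire the new button to the extraction call.
        llm_response_btn.click(
            fn=extract_page_info_by_llm,
            inputs=[user_query_input, scrape_result_textbox, model_dropdown],
            outputs=llm_response_output,
        )

gradio_ui.launch()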
llm_inference_service.py
CHANGED

@@ -19,6 +19,9 @@ def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, mod
 
     If user asks for JSON format, please provide the answer in JSON format only.
 
+    User will mostly request you to extract product information but can also ask you to extract other information from the content.
+    So always read the user query carefully and extract information accordingly.
+
     If you do not find or know the answer, do not hallucinate, do not try to generate fake answers.
     If no Context is given, simply state "No relevant information found to answer your question."
 
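The expanded prompt wording above is meant to steer the model beyond product-only extraction. Below is a rough sketch of how it might be combined with the scraped markdown and sent through one of the pinned providers (langchain-google-genai here; the NVIDIA endpoint would look analogous). The prompt text is taken from the diff, but the message layout, model handling, and function body are assumptions rather than the repository's actual implementation.

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI

SYSTEM_PROMPT = """
If user asks for JSON format, please provide the answer in JSON format only.

User will mostly request you to extract product information but can also ask you to extract other information from the content.
So always read the user query carefully and extract information accordingly.

If you do not find or know the answer, do not hallucinate, do not try to generate fake answers.
If no Context is given, simply state "No relevant information found to answer your question."
"""

def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model: str) -> str:
    # Requires GOOGLE_API_KEY in the environment; model is e.g. "gemini-2.0-flash" (assumed).
    llm = ChatGoogleGenerativeAI(model=model)
    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=f"Context:\n{scraped_markdown_content}\n\nQuery: {user_query}"),
    ]
    return llm.invoke(messages).content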
requirements.txt
CHANGED

@@ -1,21 +1,7 @@
-
-
-
-
-# PyMuPDF==1.26.4
-# langchain-google-genai==2.1.12
-# langchain-nvidia-ai-endpoints==0.3.18
-# dotenv==0.9.9
-
-
-
-gradio
-requests
-# python-dotenv
-dotenv
-
-firecrawl-py
+gradio==5.46.1
+requests==2.32.5
+dotenv==0.9.9
+firecrawl-py==4.3.6
 langchain-community
-langchain-google-genai
-langchain-nvidia-ai-endpoints
-# groq
+langchain-google-genai==2.1.12
+langchain-nvidia-ai-endpoints==0.3.18