File size: 12,729 Bytes
9536c67
 
 
 
 
 
 
 
483c169
 
255e074
483c169
b3c07b5
 
 
 
 
 
 
 
 
 
 
 
483c169
 
9536c67
 
 
 
 
 
 
 
 
 
 
 
 
 
483c169
 
 
 
 
 
b3c07b5
9536c67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3c07b5
 
483c169
 
 
b3c07b5
 
 
 
9536c67
 
 
 
 
 
 
 
 
 
 
 
 
b3c07b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255e074
483c169
9536c67
483c169
 
 
 
 
4886f5d
 
483c169
 
4886f5d
483c169
 
4886f5d
483c169
 
4886f5d
483c169
255e074
4886f5d
483c169
9cfeed9
4886f5d
9cfeed9
 
4886f5d
9cfeed9
483c169
 
 
 
 
 
 
 
 
4886f5d
483c169
 
 
 
22fa711
 
 
 
 
 
 
 
 
df26504
 
 
 
 
 
 
 
 
 
 
 
 
2e4cb4f
483c169
 
8eba581
 
 
 
 
 
 
 
 
df26504
cd825a9
483c169
255e074
 
 
9536c67
 
255e074
 
b3c07b5
 
483c169
b3c07b5
2e4cb4f
 
 
 
 
483c169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e4cb4f
255e074
483c169
 
 
 
b3c07b5
483c169
 
 
 
 
 
 
b3c07b5
 
 
 
 
 
 
 
483c169
255e074
483c169
 
 
 
255e074
b3c07b5
483c169
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
"""
This module sets up and runs the Gradio web interface for the LLM Web Scraper application.

It orchestrates the UI components, event handling for scraping and LLM extraction,
and integrates with backend services for scraping (FireCrawl, Crawl4AI) and
LLM inference. It also initializes and uses Langfuse for tracing application performance.
"""

import gradio as gr
import firecrawl_client
import crawl4ai_client
import llm_inference_service
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
from langfuse import Langfuse, get_client

# Initialize Langfuse if configured.
# Tracing is optional: `langfuse` stays None (and the tracing code paths below
# are skipped) unless both API keys are provided via config.
langfuse = None
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
    # Constructing the client registers it with the SDK's global state; the
    # instance itself is intentionally discarded and retrieved via get_client().
    # NOTE(review): this is the langfuse v3 pattern — confirm against the
    # installed SDK version.
    Langfuse(
        public_key=LANGFUSE_PUBLIC_KEY, 
        secret_key=LANGFUSE_SECRET_KEY, 
        host=LANGFUSE_HOST
    )
    langfuse = get_client()

def parse_model_provider(selection):
    """
    Parse a model and provider from a selection string.

    The expected format is "<model_name> (<provider>)". The *last*
    " (...)" group is treated as the provider, so model names that
    themselves contain parentheses are handled correctly.

    Args:
        selection (str): The string to parse.

    Returns:
        tuple[str, str]: A tuple containing the model name and provider.

    Raises:
        ValueError: If the selection string is not in the expected format.
    """
    # rpartition splits on the last " (" so a parenthesis inside the model
    # name cannot be mistaken for the provider delimiter. It also never
    # raises IndexError on malformed input (e.g. "model(provider)"), which
    # the previous split-based parsing did despite documenting ValueError.
    model, sep, provider_part = selection.rpartition(" (")
    if sep and provider_part.endswith(")"):
        model = model.strip()
        provider = provider_part[:-1].strip()
        if model and provider:
            return model, provider
    raise ValueError(f"Invalid selection format: {selection}")
    
def llm_response_wrapper(query, scrape_result, model_provider_selection, progress=gr.Progress(track_tqdm=True)):
    """
    A generator function that wraps the LLM inference call for the Gradio UI.

    It yields an initial status message, calls the LLM service to extract information,
    and then yields the final result or an error message.

    Args:
        query (str): The user's query for information extraction.
        scrape_result (str): The scraped markdown content from the website.
        model_provider_selection (str): The selected model and provider string.
        progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Yields:
        str: Status messages and the final LLM response as a markdown string.
    """
    yield "⏳ Generating response... Please wait."
    
    model, provider = parse_model_provider(model_provider_selection)
    result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    if not result or (isinstance(result, str) and result.strip() == ""):
        yield "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
        # Stop here: falling through would yield the empty result next,
        # which replaces the error message with a blank output in the UI.
        return
    yield result

async def _run_selected_scraper(url, scraper_selection):
    """Dispatch to the scraper chosen in the UI and return its markdown output.

    Args:
        url (str): The URL of the website to scrape.
        scraper_selection (str): The scraping tool selected by the user.

    Returns:
        str: The scraped markdown content, or an error message for an
        unknown scraper selection. Scraper exceptions propagate to the caller.
    """
    if scraper_selection == "Scrape with FireCrawl":
        return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
    if scraper_selection == "Scrape with Crawl4AI":
        return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
    return "❌ <span style='color:red;'>Invalid scraper selected.</span>"

async def scrape_website(url, scraper_selection, progress=gr.Progress(track_tqdm=True)):
    """An async generator that scrapes a website based on user selection for the Gradio UI.

    This function yields an initial status message, then performs the web scraping
    using the selected tool (FireCrawl or Crawl4AI). If Langfuse is configured,
    it wraps the scraping operation in a trace for observability.

    Args:
        url (str): The URL of the website to scrape.
        scraper_selection (str): The scraping tool selected by the user.
        progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Yields:
        str: A status message, followed by the scraped markdown content or an error message.
    """
    # 1. First, yield an update to show the loading state and hide the old image.
    yield "⏳ Scraping website... Please wait."

    # Without Langfuse configured, scrape directly — no tracing overhead.
    if not langfuse:
        try:
            markdown = await _run_selected_scraper(url, scraper_selection)
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
        yield markdown
        return

    # With Langfuse, run the same scrape inside a span so nested calls are traced.
    with langfuse.start_as_current_span(name="web-scraping", input={"url": url, "scraper": scraper_selection}) as span:
        try:
            markdown = await _run_selected_scraper(url, scraper_selection)
            span.update_trace(output={"markdown_char_count": len(markdown), "status": "Success"})
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
            span.update_trace(output={"error": str(e), "status": "Error"})
        yield markdown

# Gradio UI
# This block defines the entire Gradio user interface, including layout and component interactions.
with gr.Blocks() as gradio_ui:
    # Header: app title plus shield badges linking to each framework/service used.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
        <h1 style="margin: 0;"> LLM Web Scraper</h1>
        <div style="display: flex; gap: 10px;">
            <a href="https://www.langchain.com/" target="_blank">
                <img src="https://img.shields.io/badge/LangChain-blue?style=for-the-badge&logo=langchain" alt="LangChain">
            </a>
            <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                <img src="https://img.shields.io/badge/Gemini-white?style=for-the-badge&logo=google-gemini" alt="Gemini API">
            </a>
            <a href="https://build.nvidia.com/models" target="_blank">
                <img src="https://img.shields.io/badge/NVIDIA-gray?style=for-the-badge&logo=nvidia" alt="NVIDIA NIM">
            </a>
            <a href="https://firecrawl.dev/" target="_blank">
                <img src="https://img.shields.io/badge/FireCrawl-orange?style=for-the-badge&logo=fire" alt="FireCrawl">
            </a>
            <a href="https://docs.crawl4ai.com/" target="_blank">
                <img src="https://img.shields.io/badge/Crawl4AI-blueviolet?style=for-the-badge&logo=github" alt="Crawl4AI">
            </a>
            <a href="https://playwright.dev/" target="_blank">
                <img src="https://img.shields.io/badge/Playwright-brightgreen?style=for-the-badge&logo=playwright" alt="Playwright">
            </a>
            <a href="https://langfuse.com/" target="_blank">
                <img src="https://img.shields.io/badge/Langfuse-blue?style=for-the-badge&logo=langfuse" alt="Langfuse">
            </a>

        </div>
    </div>
    """)
    
    # Badge linking to the project's GitHub repository.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
        <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
            <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
        </a>
    </div>
    """)

    # Badge linking to the accompanying Medium write-up.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📖 <strong>Read the full story:</strong></span>
        <a href="https://medium.com/@frkhan/from-broken-selectors-to-intelligent-scraping-a-journey-into-llm-powered-web-automation-fc76d5fe2dbc" target="_blank">
            <img src="https://img.shields.io/badge/Medium-Read%20Story-black?style=for-the-badge&logo=medium" alt="Read Story on Medium">
        </a>
    </div>
    """)

    # Collapsible usage instructions (collapsed by default).
    with gr.Accordion("ℹ️ How to Use This App", open=False):
        gr.Markdown("""
        This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:
        1.  **Enter a URL:** Provide the URL of the web page you want to analyze.
        2.  **Define Your Query:** Specify the exact information you're looking for (e.g., product name, price, customer ratings).
        3.  **Scrape the Web Page:** Choose a scraper and click the "Scrape Website" button to extract the content of the page.
        4.  **Select Model & Provider:** Choose the LLM model you want to use for information extraction.
        5.  **Extract Info by LLM:** Click the "Extract Info by LLM" button to get the information based on your query.

        ---
        **What makes this different from a regular web scraper?**
        Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.
        """)
    
    
    # Main input area: disclaimer, URL/query inputs, scraper controls, and result box.
    with gr.Column():
        gr.HTML("""
        <div style="padding: 12px; border: 1px solid #d32f2f; background-color: #ffebee; border-radius: 8px; margin-bottom: 15px;">
            <p style="margin: 0; color: #c62828; font-weight: 500;">
                ⚠️ <code style="background-color: #ffcdd2; color: #c62828; padding: 2px 5px; border-radius: 4px; font-weight: 600;">Disclaimer:</code>.  Please be responsible when scraping websites. Users must comply with the terms of service of any website they scrape and respect 
                <code style="background-color: #ffcdd2; color: #c62828; padding: 2px 5px; border-radius: 4px; font-weight: 600;">robots.txt</code>. 
                The developers of this tool are not liable for any misuse.
            </p>
        </div>
        """)
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", autofocus=True)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating etc. / Summarize the content of this page")
        
        # Scraper selection and action buttons side by side.
        # NOTE: the dropdown choices must match the strings checked in scrape_website().
        with gr.Row():
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with Crawl4AI", "Scrape with FireCrawl"],
                value="Scrape with Crawl4AI"
            )
            scrape_btn = gr.Button("Scrape Website")
            clear_btn = gr.Button("Clear")

        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")

    
    with gr.Row():
        
        # Add a single dropdown for model and provider selection.
        # Each choice is "<model> (<provider>)", parsed by parse_model_provider().
        model_provider_dropdown = gr.Dropdown(
            label="Select Model & Provider",
            choices=[
            "gemini-2.5-flash-lite (google_genai)",
            "gemini-2.5-pro (google_genai)",
            "gemini-2.5-flash (google_genai)",
            "bytedance/seed-oss-36b-instruct (nvidia)",
            "deepseek-ai/deepseek-v3.1 (nvidia)",
            "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
            ],
            value="gemini-2.5-flash-lite (google_genai)"
        )
        
        
        llm_response_btn = gr.Button("Extract Info by LLM")
        cancel_btn = gr.Button("Cancel", variant="stop")
        

    # LLM response output area and loader
    llm_response = gr.Markdown(
        "",
        label="LLM Response",
        show_copy_button=True,
        visible=True
    )
    # Removed custom loader; Gradio will show a spinner automatically during processing.


    # Wire the scrape button; the `progress` parameter of scrape_website is
    # injected by Gradio and therefore not listed in `inputs`.
    scrape_event = scrape_btn.click(
        fn=scrape_website, 
        inputs=[url_input, scraper_dropdown], 
        outputs=[scrape_result_textbox],
    )

    # Clear button functionality: reset all four text components to empty strings.
    clear_btn.click(lambda: ("", "", "", ""), outputs=[url_input, query_input, scrape_result_textbox, llm_response])

    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
    
    # Cancel aborts whichever of the two long-running events is in flight.
    cancel_btn.click(fn=lambda: None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])

# Bind to all interfaces so the app is reachable from outside its container/host.
gradio_ui.launch(server_name="0.0.0.0")