File size: 7,426 Bytes
483c169
 
 
255e074
483c169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255e074
 
 
 
 
 
 
 
 
 
 
483c169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255e074
483c169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e4cb4f
 
 
 
 
 
 
 
 
 
 
255e074
 
 
 
 
 
 
2e4cb4f
 
 
483c169
 
 
 
 
 
255e074
 
 
 
 
 
 
483c169
 
2e4cb4f
 
 
 
 
483c169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e4cb4f
255e074
483c169
 
 
 
 
 
 
 
 
 
 
 
255e074
483c169
255e074
483c169
 
 
 
255e074
 
483c169
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import gradio as gr

import firecrawl_client
import crawl4ai_client
import llm_inference_service

def parse_model_provider(selection):
    """Split a dropdown selection of the form ``"<model_name> (<provider>)"``.

    Args:
        selection: The raw dropdown value, e.g. ``"gemini-2.5-pro (google_genai)"``.

    Returns:
        A ``(model, provider)`` tuple of stripped strings.

    Raises:
        ValueError: If *selection* does not match the expected format.
    """
    # Require the actual " (" separator and a trailing ")" — the previous
    # check ("(" in s and ")" in s) let inputs like "a(b)" through and then
    # crashed with IndexError on the split instead of raising ValueError.
    if " (" in selection and selection.rstrip().endswith(")"):
        # rpartition on the LAST " (" so model names that themselves contain
        # parentheses are not truncated, and only the final ")" is removed.
        model, _, provider_part = selection.rpartition(" (")
        provider = provider_part.rstrip().rstrip(")").strip()
        return model.strip(), provider
    raise ValueError(f"Invalid selection format: {selection}")
    
def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Run LLM extraction of *query* over *scrape_result* and return markdown.

    Args:
        query: What the user wants extracted (e.g. "product name, price").
        scrape_result: The scraped page content (markdown) to analyze.
        model_provider_selection: Dropdown value, "<model> (<provider>)".

    Returns:
        The LLM's extraction result, or a red HTML error note when the
        service returns nothing usable.
    """
    model, provider = parse_model_provider(model_provider_selection)
    extracted = llm_inference_service.extract_page_info_by_llm(
        query, scrape_result, model, provider
    )
    # Treat falsy results and whitespace-only strings as "nothing found".
    blank = isinstance(extracted, str) and not extracted.strip()
    if extracted and not blank:
        return extracted
    return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"

async def scrape_website(url, scraper_selection):
    """Scrape *url* with the backend picked in the UI dropdown.

    Args:
        url: The page to scrape.
        scraper_selection: Either "Scrape with FireCrawl" or
            "Scrape with Crawl4AI" (the dropdown's literal values).

    Returns:
        The scraped page as markdown, or a red HTML error note on an
        unknown selection or any scraper failure.
    """
    try:
        if scraper_selection == "Scrape with Crawl4AI":
            # Crawl4AI client is async and must be awaited.
            return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        if scraper_selection == "Scrape with FireCrawl":
            return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
        return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
    except Exception as e:
        # Surface scraper errors in the UI rather than crashing the app.
        return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"

# ---------------------------------------------------------------------------
# Gradio UI: header badges, usage instructions, scrape controls, LLM controls.
# Event wiring at the bottom connects the buttons to the handlers above.
# ---------------------------------------------------------------------------
with gr.Blocks() as gradio_ui:
    # Page header: title plus badge links to the frameworks/services used.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
        <h1 style="margin: 0;"> LLM Web Scraper</h1>
        <div style="display: flex; gap: 10px;">
            <a href="https://github.com/langchain-ai/langchain" target="_blank">
                <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
            </a>
            <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
            </a>
            <a href="https://build.nvidia.com/models" target="_blank">
                <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
            </a>
            <a href="https://firecrawl.dev/" target="_blank">
                <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
            </a>
            <a href="https://docs.crawl4ai.com/" target="_blank">
                <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
            </a>

        </div>
    </div>
    """)

    # Link to the project's source repository.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
        <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
            <img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
        </a>
    </div>
    """)

    # Static "How to Use" instructions panel.
    gr.HTML("""
    <div style="margin-bottom: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
        <h2 style="margin-top: 0;">How to Use This App</h2>
        <p>This app combines web scraping with the power of Large Language Models (LLMs) to extract specific information from web pages. Here's how it works:</p>
        <ol>
            <li><strong>Enter a URL:</strong> Provide the URL of the web page you want to analyze.</li>
            <li><strong>Define Your Query:</strong> Specify the exact information you're looking for (e.g., product name, price, customer ratings).</li>
            <li><strong>Scrape the Web Page:</strong> Click the "Scrape with FireCrawl" button to extract the content of the page.</li>
            <li><strong>Select Model & Provider:</strong> Choose the LLM model you want to use for information extraction.</li>
            <li><strong>Extract Info by LLM:</strong> Click the "Extract Info by LLM" button to get the information based on your query.</li>
        </ol>
        
        <br />
        <br />
        
        <p><strong>What makes this different from a regular web scraper?</strong>  </p>
    
        <p>Traditional web scrapers require pre-programming to extract product data for each specific website. These scrapers are brittle and can break if the website's design changes. This app uses LLMs to <em>understand</em> your query and extract only the relevant information, saving you time and effort and removing the need for constant maintenance.</p>
    </div>
    """)


    # Scrape inputs: URL, extraction query, scraper backend, and result box.
    with gr.Column():
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
        
        with gr.Row():
            # Choices must match the literals tested in scrape_website().
            scraper_dropdown = gr.Dropdown(
                label="Select Scraper",
                choices=["Scrape with FireCrawl", "Scrape with Crawl4AI"],
                value="Scrape with FireCrawl"
            )
            scrape_btn = gr.Button("Scrape Website")
        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
        
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
        gr.Markdown("### 🧠 LLM Extraction")
        gr.Markdown("Use a language model to extract structured information from the scraped content.")
        gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")

    
    # LLM controls: combined model/provider picker plus run & cancel buttons.
    with gr.Row():
        
        # Single dropdown; values are parsed by parse_model_provider(),
        # so each entry must follow the "<model> (<provider>)" format.
        model_provider_dropdown = gr.Dropdown(
            label="Select Model & Provider",
            choices=[
            "gemini-2.5-flash-lite (google_genai)",
            "gemini-2.5-pro (google_genai)",
            "gemini-2.5-flash (google_genai)",
            "bytedance/seed-oss-36b-instruct (nvidia)",
            "deepseek-ai/deepseek-v3.1 (nvidia)",
            "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
            ],
            value="gemini-2.5-flash-lite (google_genai)"
        )
        
        
        llm_response_btn = gr.Button("Extract Info by LLM")
        cancel_btn = gr.Button("Cancel", variant="stop")
        

    # LLM response output area and loader
    llm_response = gr.Markdown(
        "\n" * 9,  # 9 newlines + 1 line for empty content = 10 lines minimum
        label="LLM Response",
        show_copy_button=True,
        visible=True
    )
    # Removed custom loader; Gradio will show a spinner automatically during processing.


    # Event wiring. The click events are kept in variables so the Cancel
    # button can abort either in-flight operation via `cancels=`.
    scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)

    llm_event = llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )
    
    cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])

# Bind to all interfaces so the app is reachable inside a container.
gradio_ui.launch(server_name="0.0.0.0")