Spaces:
Sleeping
Sleeping
-- Fixed the package versions.
Browse files
-- Integrate Langfuse in scraping & remove screenshot feature
- Adds Langfuse tracing to the `scrape_website` function to provide observability for the scraping step.
- Completely removes the screenshot functionality, including UI elements, client logic, dependencies, and documentation.
- Fixes a Gradio `IndexError` on the cancel button by replacing `fn=None` with a lambda function.
- .env.example +4 -1
- app.py +66 -19
- config.py +6 -0
- crawl4ai_client.py +7 -4
- docker-compose.dev.yml +3 -0
- docker-compose.yml +3 -1
- llm_inference_service.py +22 -1
- requirements.txt +5 -4
.env.example
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
|
| 2 |
NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
|
| 3 |
-
FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
|
| 2 |
NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
|
| 3 |
+
FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
|
| 4 |
+
LANGFUSE_PUBLIC_KEY="pk-lf-..."
|
| 5 |
+
LANGFUSE_SECRET_KEY="sk-lf-..."
|
| 6 |
+
LANGFUSE_HOST="https://cloud.langfuse.com" # Or your self-hosted instance
|
app.py
CHANGED
|
@@ -1,8 +1,19 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
import firecrawl_client
|
| 4 |
import crawl4ai_client
|
| 5 |
import llm_inference_service
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
def parse_model_provider(selection):
|
| 8 |
# Expected format: "<model_name> (<provider>)"
|
|
@@ -12,23 +23,50 @@ def parse_model_provider(selection):
|
|
| 12 |
return model, provider
|
| 13 |
raise ValueError(f"Invalid selection format: {selection}")
|
| 14 |
|
| 15 |
-
def llm_response_wrapper(query, scrape_result, model_provider_selection):
|
|
|
|
|
|
|
| 16 |
model, provider = parse_model_provider(model_provider_selection)
|
| 17 |
result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
|
| 18 |
if not result or (isinstance(result, str) and result.strip() == ""):
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
async def scrape_website(url, scraper_selection):
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
#Gradio UI
|
| 34 |
with gr.Blocks() as gradio_ui:
|
|
@@ -99,8 +137,10 @@ with gr.Blocks() as gradio_ui:
|
|
| 99 |
value="Scrape with FireCrawl"
|
| 100 |
)
|
| 101 |
scrape_btn = gr.Button("Scrape Website")
|
|
|
|
|
|
|
| 102 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 103 |
-
|
| 104 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
| 105 |
gr.Markdown("### 🧠 LLM Extraction")
|
| 106 |
gr.Markdown("Use a language model to extract structured information from the scraped content.")
|
|
@@ -130,7 +170,7 @@ with gr.Blocks() as gradio_ui:
|
|
| 130 |
|
| 131 |
# LLM response output area and loader
|
| 132 |
llm_response = gr.Markdown(
|
| 133 |
-
"
|
| 134 |
label="LLM Response",
|
| 135 |
show_copy_button=True,
|
| 136 |
visible=True
|
|
@@ -138,7 +178,14 @@ with gr.Blocks() as gradio_ui:
|
|
| 138 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 139 |
|
| 140 |
|
| 141 |
-
scrape_event = scrape_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
llm_event = llm_response_btn.click(
|
| 144 |
fn=llm_response_wrapper,
|
|
@@ -146,6 +193,6 @@ with gr.Blocks() as gradio_ui:
|
|
| 146 |
outputs=llm_response
|
| 147 |
)
|
| 148 |
|
| 149 |
-
cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
|
| 150 |
|
| 151 |
gradio_ui.launch(server_name="0.0.0.0")
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import firecrawl_client
|
| 3 |
import crawl4ai_client
|
| 4 |
import llm_inference_service
|
| 5 |
+
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
|
| 6 |
+
from langfuse import Langfuse, get_client
|
| 7 |
+
|
| 8 |
+
# Initialize Langfuse if configured
|
| 9 |
+
langfuse = None
|
| 10 |
+
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
|
| 11 |
+
Langfuse(
|
| 12 |
+
public_key=LANGFUSE_PUBLIC_KEY,
|
| 13 |
+
secret_key=LANGFUSE_SECRET_KEY,
|
| 14 |
+
host=LANGFUSE_HOST
|
| 15 |
+
)
|
| 16 |
+
langfuse = get_client()
|
| 17 |
|
| 18 |
def parse_model_provider(selection):
|
| 19 |
# Expected format: "<model_name> (<provider>)"
|
|
|
|
| 23 |
return model, provider
|
| 24 |
raise ValueError(f"Invalid selection format: {selection}")
|
| 25 |
|
| 26 |
+
def llm_response_wrapper(query, scrape_result, model_provider_selection, progress=gr.Progress(track_tqdm=True)):
|
| 27 |
+
yield "⏳ Generating response... Please wait."
|
| 28 |
+
|
| 29 |
model, provider = parse_model_provider(model_provider_selection)
|
| 30 |
result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
|
| 31 |
if not result or (isinstance(result, str) and result.strip() == ""):
|
| 32 |
+
yield "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
|
| 33 |
+
yield result
|
| 34 |
+
|
| 35 |
+
async def scrape_website(url, scraper_selection, progress=gr.Progress(track_tqdm=True)):
|
| 36 |
+
"""
|
| 37 |
+
Performs the scraping and yields Gradio component updates directly.
|
| 38 |
+
This generator pattern is the most reliable way to handle sequential UI updates.
|
| 39 |
+
"""
|
| 40 |
+
# 1. First, yield an update to show the loading state and hide the old image.
|
| 41 |
+
yield "⏳ Scraping website... Please wait."
|
| 42 |
+
|
| 43 |
+
markdown = ""
|
| 44 |
+
if not langfuse:
|
| 45 |
+
try:
|
| 46 |
+
if scraper_selection == "Scrape with FireCrawl":
|
| 47 |
+
markdown = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
|
| 48 |
+
elif scraper_selection == "Scrape with Crawl4AI":
|
| 49 |
+
markdown = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
|
| 50 |
+
else:
|
| 51 |
+
markdown = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
|
| 52 |
+
except Exception as e:
|
| 53 |
+
markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
|
| 54 |
+
yield markdown
|
| 55 |
+
return
|
| 56 |
+
|
| 57 |
+
with langfuse.start_as_current_span(name="web-scraping", input={"url": url, "scraper": scraper_selection}) as span:
|
| 58 |
+
try:
|
| 59 |
+
if scraper_selection == "Scrape with FireCrawl":
|
| 60 |
+
markdown = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
|
| 61 |
+
elif scraper_selection == "Scrape with Crawl4AI":
|
| 62 |
+
markdown = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
|
| 63 |
+
else:
|
| 64 |
+
markdown = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
|
| 65 |
+
span.update_trace(output={"markdown_char_count": len(markdown), "status": "Success"})
|
| 66 |
+
except Exception as e:
|
| 67 |
+
markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
|
| 68 |
+
span.update_trace(output={"error": str(e), "status": "Error"})
|
| 69 |
+
yield markdown
|
| 70 |
|
| 71 |
#Gradio UI
|
| 72 |
with gr.Blocks() as gradio_ui:
|
|
|
|
| 137 |
value="Scrape with FireCrawl"
|
| 138 |
)
|
| 139 |
scrape_btn = gr.Button("Scrape Website")
|
| 140 |
+
clear_btn = gr.Button("Clear")
|
| 141 |
+
|
| 142 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 143 |
+
|
| 144 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
| 145 |
gr.Markdown("### 🧠 LLM Extraction")
|
| 146 |
gr.Markdown("Use a language model to extract structured information from the scraped content.")
|
|
|
|
| 170 |
|
| 171 |
# LLM response output area and loader
|
| 172 |
llm_response = gr.Markdown(
|
| 173 |
+
"",
|
| 174 |
label="LLM Response",
|
| 175 |
show_copy_button=True,
|
| 176 |
visible=True
|
|
|
|
| 178 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 179 |
|
| 180 |
|
| 181 |
+
scrape_event = scrape_btn.click(
|
| 182 |
+
fn=scrape_website,
|
| 183 |
+
inputs=[url_input, scraper_dropdown],
|
| 184 |
+
outputs=[scrape_result_textbox],
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
# Clear button functionality
|
| 188 |
+
clear_btn.click(lambda: ("", "", "", ""), outputs=[url_input, query_input, scrape_result_textbox, llm_response])
|
| 189 |
|
| 190 |
llm_event = llm_response_btn.click(
|
| 191 |
fn=llm_response_wrapper,
|
|
|
|
| 193 |
outputs=llm_response
|
| 194 |
)
|
| 195 |
|
| 196 |
+
cancel_btn.click(fn=lambda: None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
|
| 197 |
|
| 198 |
gradio_ui.launch(server_name="0.0.0.0")
|
config.py
CHANGED
|
@@ -21,6 +21,9 @@ if os.path.exists(env_path):
|
|
| 21 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 22 |
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
|
| 23 |
FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
if not GOOGLE_API_KEY:
|
| 26 |
print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")
|
|
@@ -30,3 +33,6 @@ if not NVIDIA_API_KEY:
|
|
| 30 |
|
| 31 |
if not FIRE_CRAWL_API_KEY:
|
| 32 |
print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 22 |
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
|
| 23 |
FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
| 24 |
+
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
|
| 25 |
+
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
|
| 26 |
+
LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com") # Default to cloud
|
| 27 |
|
| 28 |
if not GOOGLE_API_KEY:
|
| 29 |
print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")
|
|
|
|
| 33 |
|
| 34 |
if not FIRE_CRAWL_API_KEY:
|
| 35 |
print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
|
| 36 |
+
|
| 37 |
+
if not LANGFUSE_PUBLIC_KEY or not LANGFUSE_SECRET_KEY:
|
| 38 |
+
print("⚠️ Warning: LANGFUSE_PUBLIC_KEY or LANGFUSE_SECRET_KEY is not set. Langfuse tracing will be disabled.")
|
crawl4ai_client.py
CHANGED
|
@@ -5,10 +5,13 @@ from crawl4ai import AsyncWebCrawler
|
|
| 5 |
async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
|
| 6 |
try:
|
| 7 |
async with AsyncWebCrawler() as crawler:
|
| 8 |
-
result = await crawler.arun(url=url)
|
|
|
|
|
|
|
|
|
|
| 9 |
if result and result.markdown:
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
return
|
| 13 |
except Exception as e:
|
| 14 |
return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
|
|
|
|
| 5 |
async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
|
| 6 |
try:
|
| 7 |
async with AsyncWebCrawler() as crawler:
|
| 8 |
+
result = await crawler.arun(url=url)
|
| 9 |
+
|
| 10 |
+
markdown_content = "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
|
| 11 |
+
|
| 12 |
if result and result.markdown:
|
| 13 |
+
markdown_content = result.markdown
|
| 14 |
+
|
| 15 |
+
return markdown_content
|
| 16 |
except Exception as e:
|
| 17 |
return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
|
docker-compose.dev.yml
CHANGED
|
@@ -12,6 +12,9 @@ services:
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
|
|
|
|
|
|
|
|
|
| 15 |
volumes:
|
| 16 |
- .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
|
| 17 |
restart: unless-stopped
|
|
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
| 15 |
+
- LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
|
| 16 |
+
- LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
|
| 17 |
+
- LANGFUSE_HOST=${LANGFUSE_HOST}
|
| 18 |
volumes:
|
| 19 |
- .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
|
| 20 |
restart: unless-stopped
|
docker-compose.yml
CHANGED
|
@@ -12,5 +12,7 @@ services:
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
|
|
|
|
|
|
|
|
|
| 15 |
restart: unless-stopped
|
| 16 |
-
|
|
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
| 15 |
+
- LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
|
| 16 |
+
- LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
|
| 17 |
+
- LANGFUSE_HOST=${LANGFUSE_HOST}
|
| 18 |
restart: unless-stopped
|
|
|
llm_inference_service.py
CHANGED
|
@@ -1,4 +1,25 @@
|
|
| 1 |
from langchain.chat_models import init_chat_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
|
|
@@ -38,6 +59,6 @@ def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, mod
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
llm = init_chat_model(model_name, model_provider=model_provider)
|
| 41 |
-
response = llm.invoke(prompt)
|
| 42 |
return response.content
|
| 43 |
|
|
|
|
| 1 |
from langchain.chat_models import init_chat_model
|
| 2 |
+
from langfuse.langchain import CallbackHandler
|
| 3 |
+
from langfuse import Langfuse
|
| 4 |
+
|
| 5 |
+
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
|
| 6 |
+
|
| 7 |
+
# Initialize Langfuse client
|
| 8 |
+
# It is safe to do this even if keys are not set, as the handler will only be used if keys are present.
|
| 9 |
+
langfuse_callback_handler = None
|
| 10 |
+
callbacks = []
|
| 11 |
+
|
| 12 |
+
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
|
| 13 |
+
langfuse = Langfuse(
|
| 14 |
+
public_key=LANGFUSE_PUBLIC_KEY,
|
| 15 |
+
secret_key=LANGFUSE_SECRET_KEY,
|
| 16 |
+
host=LANGFUSE_HOST,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
langfuse_callback_handler = CallbackHandler()
|
| 20 |
+
|
| 21 |
+
callbacks.append(langfuse_callback_handler)
|
| 22 |
+
|
| 23 |
|
| 24 |
|
| 25 |
def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
|
|
|
|
| 59 |
"""
|
| 60 |
|
| 61 |
llm = init_chat_model(model_name, model_provider=model_provider)
|
| 62 |
+
response = llm.invoke(prompt, config={"callbacks": callbacks})
|
| 63 |
return response.content
|
| 64 |
|
requirements.txt
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
gradio==5.46.1
|
| 2 |
requests==2.32.5
|
| 3 |
-
dotenv==
|
| 4 |
firecrawl-py==4.3.6
|
| 5 |
-
langchain-community
|
| 6 |
langchain-google-genai==2.1.12
|
| 7 |
langchain-nvidia-ai-endpoints==0.3.18
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 1 |
gradio==5.46.1
|
| 2 |
requests==2.32.5
|
| 3 |
+
python-dotenv==1.1.1
|
| 4 |
firecrawl-py==4.3.6
|
|
|
|
| 5 |
langchain-google-genai==2.1.12
|
| 6 |
langchain-nvidia-ai-endpoints==0.3.18
|
| 7 |
+
Crawl4AI==0.7.4
|
| 8 |
+
langfuse==3.5.2
|
| 9 |
+
langchain==0.3.27
|
| 10 |
+
langchain-community==0.3.30
|