frkhan committed on
Commit
b3c07b5
·
1 Parent(s): 597eb22

-- Fixed the package versions.

Browse files

-- Integrate Langfuse in scraping & remove screenshot feature

- Adds Langfuse tracing to the `scrape_website` function to provide observability for the scraping step.

- Completely removes the screenshot functionality, including UI elements, client logic, dependencies, and documentation.

- Fixes a Gradio `IndexError` on the cancel button by replacing `fn=None` with a lambda function.

.env.example CHANGED
@@ -1,3 +1,6 @@
1
  GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
2
  NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
3
- FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
 
 
 
 
1
  GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
2
  NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
3
+ FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
4
+ LANGFUSE_PUBLIC_KEY="pk-lf-..."
5
+ LANGFUSE_SECRET_KEY="sk-lf-..."
6
+ LANGFUSE_HOST="https://cloud.langfuse.com" # Or your self-hosted instance
app.py CHANGED
@@ -1,8 +1,19 @@
1
  import gradio as gr
2
-
3
  import firecrawl_client
4
  import crawl4ai_client
5
  import llm_inference_service
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def parse_model_provider(selection):
8
  # Expected format: "<model_name> (<provider>)"
@@ -12,23 +23,50 @@ def parse_model_provider(selection):
12
  return model, provider
13
  raise ValueError(f"Invalid selection format: {selection}")
14
 
15
- def llm_response_wrapper(query, scrape_result, model_provider_selection):
 
 
16
  model, provider = parse_model_provider(model_provider_selection)
17
  result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
18
  if not result or (isinstance(result, str) and result.strip() == ""):
19
- return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
20
- return result
21
-
22
- async def scrape_website(url, scraper_selection):
23
- try:
24
- if scraper_selection == "Scrape with FireCrawl":
25
- return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
26
- elif scraper_selection == "Scrape with Crawl4AI":
27
- return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
28
- else:
29
- return "❌ <span style='color:red;'>Invalid scraper selected.</span>"
30
- except Exception as e:
31
- return f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  #Gradio UI
34
  with gr.Blocks() as gradio_ui:
@@ -99,8 +137,10 @@ with gr.Blocks() as gradio_ui:
99
  value="Scrape with FireCrawl"
100
  )
101
  scrape_btn = gr.Button("Scrape Website")
 
 
102
  scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
103
-
104
  gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
105
  gr.Markdown("### 🧠 LLM Extraction")
106
  gr.Markdown("Use a language model to extract structured information from the scraped content.")
@@ -130,7 +170,7 @@ with gr.Blocks() as gradio_ui:
130
 
131
  # LLM response output area and loader
132
  llm_response = gr.Markdown(
133
- "\n" * 9, # 9 newlines + 1 line for empty content = 10 lines minimum
134
  label="LLM Response",
135
  show_copy_button=True,
136
  visible=True
@@ -138,7 +178,14 @@ with gr.Blocks() as gradio_ui:
138
  # Removed custom loader; Gradio will show a spinner automatically during processing.
139
 
140
 
141
- scrape_event = scrape_btn.click(fn=scrape_website, inputs=[url_input, scraper_dropdown], outputs=scrape_result_textbox)
 
 
 
 
 
 
 
142
 
143
  llm_event = llm_response_btn.click(
144
  fn=llm_response_wrapper,
@@ -146,6 +193,6 @@ with gr.Blocks() as gradio_ui:
146
  outputs=llm_response
147
  )
148
 
149
- cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
150
 
151
  gradio_ui.launch(server_name="0.0.0.0")
 
1
  import gradio as gr
 
2
  import firecrawl_client
3
  import crawl4ai_client
4
  import llm_inference_service
5
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
from langfuse import Langfuse, get_client

# Langfuse tracing is optional: it is wired up only when both keys are
# present, so an unconfigured environment runs with tracing disabled.
langfuse = None
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
    # Constructing the client registers it globally; get_client() then
    # returns the shared instance used for span creation.
    Langfuse(
        public_key=LANGFUSE_PUBLIC_KEY,
        secret_key=LANGFUSE_SECRET_KEY,
        host=LANGFUSE_HOST,
    )
    langfuse = get_client()
17
 
18
  def parse_model_provider(selection):
19
  # Expected format: "<model_name> (<provider>)"
 
23
  return model, provider
24
  raise ValueError(f"Invalid selection format: {selection}")
25
 
26
def llm_response_wrapper(query, scrape_result, model_provider_selection, progress=gr.Progress(track_tqdm=True)):
    """
    Generator wrapper around the LLM extraction step for the Gradio UI.

    Yields a loading message first so the output area updates immediately,
    then yields either the extracted result or an HTML-formatted error
    message when the model returned nothing usable.
    """
    yield "⏳ Generating response... Please wait."

    model, provider = parse_model_provider(model_provider_selection)
    result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    if not result or (isinstance(result, str) and result.strip() == ""):
        yield "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
        # Stop here: without this return the generator would fall through and
        # yield the empty `result`, overwriting the error message in the UI.
        return
    yield result
34
+
35
async def scrape_website(url, scraper_selection, progress=gr.Progress(track_tqdm=True)):
    """
    Scrape ``url`` with the selected backend and yield Gradio updates.

    This generator pattern is the most reliable way to handle sequential UI
    updates: a loading message is yielded first, then the scraped markdown
    (or an HTML-formatted error string). When Langfuse is configured, the
    scrape is wrapped in a tracing span for observability.
    """
    # 1. First, yield a loading message so the UI shows progress immediately.
    yield "⏳ Scraping website... Please wait."

    async def _run_scraper():
        # Dispatch to the chosen scraper backend; returns markdown, or an
        # HTML-formatted error string for an unknown selection.
        if scraper_selection == "Scrape with FireCrawl":
            return firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
        if scraper_selection == "Scrape with Crawl4AI":
            return await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
        return "❌ <span style='color:red;'>Invalid scraper selected.</span>"

    if not langfuse:
        # Langfuse not configured: scrape without tracing.
        try:
            markdown = await _run_scraper()
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
        yield markdown
        return

    # Langfuse configured: record the scrape inside a span so the step is
    # observable (input URL/scraper, output size or error).
    with langfuse.start_as_current_span(name="web-scraping", input={"url": url, "scraper": scraper_selection}) as span:
        try:
            markdown = await _run_scraper()
            span.update_trace(output={"markdown_char_count": len(markdown), "status": "Success"})
        except Exception as e:
            markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
            span.update_trace(output={"error": str(e), "status": "Error"})
    yield markdown
70
 
71
  #Gradio UI
72
  with gr.Blocks() as gradio_ui:
 
137
  value="Scrape with FireCrawl"
138
  )
139
  scrape_btn = gr.Button("Scrape Website")
140
+ clear_btn = gr.Button("Clear")
141
+
142
  scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
143
+
144
  gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
145
  gr.Markdown("### 🧠 LLM Extraction")
146
  gr.Markdown("Use a language model to extract structured information from the scraped content.")
 
170
 
171
  # LLM response output area and loader
172
  llm_response = gr.Markdown(
173
+ "",
174
  label="LLM Response",
175
  show_copy_button=True,
176
  visible=True
 
178
  # Removed custom loader; Gradio will show a spinner automatically during processing.
179
 
180
 
181
+ scrape_event = scrape_btn.click(
182
+ fn=scrape_website,
183
+ inputs=[url_input, scraper_dropdown],
184
+ outputs=[scrape_result_textbox],
185
+ )
186
+
187
+ # Clear button functionality
188
+ clear_btn.click(lambda: ("", "", "", ""), outputs=[url_input, query_input, scrape_result_textbox, llm_response])
189
 
190
  llm_event = llm_response_btn.click(
191
  fn=llm_response_wrapper,
 
193
  outputs=llm_response
194
  )
195
 
196
+ cancel_btn.click(fn=lambda: None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
197
 
198
  gradio_ui.launch(server_name="0.0.0.0")
config.py CHANGED
@@ -21,6 +21,9 @@ if os.path.exists(env_path):
21
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
22
  NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
23
  FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
 
 
 
24
 
25
  if not GOOGLE_API_KEY:
26
  print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")
@@ -30,3 +33,6 @@ if not NVIDIA_API_KEY:
30
 
31
  if not FIRE_CRAWL_API_KEY:
32
  print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
 
 
 
 
21
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
22
  NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
23
  FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
24
+ LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
25
+ LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
26
+ LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com") # Default to cloud
27
 
28
  if not GOOGLE_API_KEY:
29
  print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")
 
33
 
34
  if not FIRE_CRAWL_API_KEY:
35
  print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
36
+
37
+ if not LANGFUSE_PUBLIC_KEY or not LANGFUSE_SECRET_KEY:
38
+ print("⚠️ Warning: LANGFUSE_PUBLIC_KEY or LANGFUSE_SECRET_KEY is not set. Langfuse tracing will be disabled.")
crawl4ai_client.py CHANGED
@@ -5,10 +5,13 @@ from crawl4ai import AsyncWebCrawler
5
  async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
6
  try:
7
  async with AsyncWebCrawler() as crawler:
8
- result = await crawler.arun(url=url)
 
 
 
9
  if result and result.markdown:
10
- return result.markdown
11
- # If result is None or markdown is empty
12
- return "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
13
  except Exception as e:
14
  return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
 
5
async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
    """
    Scrape *url* with Crawl4AI and return the page content as markdown.

    On success returns the markdown text; otherwise returns an
    HTML-formatted error string suitable for direct display in the UI.
    """
    try:
        async with AsyncWebCrawler() as crawler:
            crawl_result = await crawler.arun(url=url)
            # Guard: a crawl can succeed yet produce no usable markdown.
            if crawl_result and crawl_result.markdown:
                return crawl_result.markdown
            return "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
    except Exception as e:
        return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
docker-compose.dev.yml CHANGED
@@ -12,6 +12,9 @@ services:
12
  - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
13
  - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
14
  - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
 
 
 
15
  volumes:
16
  - .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
17
  restart: unless-stopped
 
12
  - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
13
  - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
14
  - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
15
+ - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
16
+ - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
17
+ - LANGFUSE_HOST=${LANGFUSE_HOST}
18
  volumes:
19
  - .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
20
  restart: unless-stopped
docker-compose.yml CHANGED
@@ -12,5 +12,7 @@ services:
12
  - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
13
  - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
14
  - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
 
 
 
15
  restart: unless-stopped
16
-
 
12
  - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
13
  - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
14
  - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
15
+ - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
16
+ - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
17
+ - LANGFUSE_HOST=${LANGFUSE_HOST}
18
  restart: unless-stopped
 
llm_inference_service.py CHANGED
@@ -1,4 +1,25 @@
1
  from langchain.chat_models import init_chat_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
@@ -38,6 +59,6 @@ def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, mod
38
  """
39
 
40
  llm = init_chat_model(model_name, model_provider=model_provider)
41
- response = llm.invoke(prompt)
42
  return response.content
43
 
 
1
  from langchain.chat_models import init_chat_model
2
+ from langfuse.langchain import CallbackHandler
3
+ from langfuse import Langfuse
4
+
5
+ from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
6
+
7
+ # Initialize Langfuse client
8
+ # It is safe to do this even if keys are not set, as the handler will only be used if keys are present.
9
+ langfuse_callback_handler = None
10
+ callbacks = []
11
+
12
+ if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
13
+ langfuse = Langfuse(
14
+ public_key=LANGFUSE_PUBLIC_KEY,
15
+ secret_key=LANGFUSE_SECRET_KEY,
16
+ host=LANGFUSE_HOST,
17
+ )
18
+
19
+ langfuse_callback_handler = CallbackHandler()
20
+
21
+ callbacks.append(langfuse_callback_handler)
22
+
23
 
24
 
25
  def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
 
59
  """
60
 
61
  llm = init_chat_model(model_name, model_provider=model_provider)
62
+ response = llm.invoke(prompt, config={"callbacks": callbacks})
63
  return response.content
64
 
requirements.txt CHANGED
@@ -1,9 +1,10 @@
1
  gradio==5.46.1
2
  requests==2.32.5
3
- dotenv==0.9.9
4
  firecrawl-py==4.3.6
5
- langchain-community
6
  langchain-google-genai==2.1.12
7
  langchain-nvidia-ai-endpoints==0.3.18
8
- asyncio
9
- crawl4ai
 
 
 
1
  gradio==5.46.1
2
  requests==2.32.5
3
+ python-dotenv==1.1.1
4
  firecrawl-py==4.3.6
 
5
  langchain-google-genai==2.1.12
6
  langchain-nvidia-ai-endpoints==0.3.18
7
+ Crawl4AI==0.7.4
8
+ langfuse==3.5.2
9
+ langchain==0.3.27
10
+ langchain-community==0.3.30