Spaces:
Sleeping
Sleeping
-- Fixed the package versions.
Browse files
-- Integrate Langfuse in scraping & remove screenshot feature
- Adds Langfuse tracing to the `scrape_website` function to provide observability for the scraping step.
- Completely removes the screenshot functionality, including UI elements, client logic, dependencies, and documentation.
- Fixes a Gradio `IndexError` on the cancel button by replacing `fn=None` with a lambda function.
- .env.example +4 -1
- app.py +66 -19
- config.py +6 -0
- crawl4ai_client.py +7 -4
- docker-compose.dev.yml +3 -0
- docker-compose.yml +3 -1
- llm_inference_service.py +22 -1
- requirements.txt +5 -4
.env.example
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
|
| 2 |
NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
|
| 3 |
-
FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
|
| 2 |
NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
|
| 3 |
+
FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
|
| 4 |
+
LANGFUSE_PUBLIC_KEY="pk-lf-..."
|
| 5 |
+
LANGFUSE_SECRET_KEY="sk-lf-..."
|
| 6 |
+
LANGFUSE_HOST="https://cloud.langfuse.com" # Or your self-hosted instance
|
app.py
CHANGED
|
@@ -1,8 +1,19 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
import firecrawl_client
|
| 4 |
import crawl4ai_client
|
| 5 |
import llm_inference_service
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
def parse_model_provider(selection):
|
| 8 |
# Expected format: "<model_name> (<provider>)"
|
|
@@ -12,23 +23,50 @@ def parse_model_provider(selection):
|
|
| 12 |
return model, provider
|
| 13 |
raise ValueError(f"Invalid selection format: {selection}")
|
| 14 |
|
| 15 |
-
def llm_response_wrapper(query, scrape_result, model_provider_selection):
|
|
|
|
|
|
|
| 16 |
model, provider = parse_model_provider(model_provider_selection)
|
| 17 |
result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
|
| 18 |
if not result or (isinstance(result, str) and result.strip() == ""):
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
async def scrape_website(url, scraper_selection):
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
#Gradio UI
|
| 34 |
with gr.Blocks() as gradio_ui:
|
|
@@ -99,8 +137,10 @@ with gr.Blocks() as gradio_ui:
|
|
| 99 |
value="Scrape with FireCrawl"
|
| 100 |
)
|
| 101 |
scrape_btn = gr.Button("Scrape Website")
|
|
|
|
|
|
|
| 102 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 103 |
-
|
| 104 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
| 105 |
gr.Markdown("### 🧠 LLM Extraction")
|
| 106 |
gr.Markdown("Use a language model to extract structured information from the scraped content.")
|
|
@@ -130,7 +170,7 @@ with gr.Blocks() as gradio_ui:
|
|
| 130 |
|
| 131 |
# LLM response output area and loader
|
| 132 |
llm_response = gr.Markdown(
|
| 133 |
-
"
|
| 134 |
label="LLM Response",
|
| 135 |
show_copy_button=True,
|
| 136 |
visible=True
|
|
@@ -138,7 +178,14 @@ with gr.Blocks() as gradio_ui:
|
|
| 138 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 139 |
|
| 140 |
|
| 141 |
-
scrape_event = scrape_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
llm_event = llm_response_btn.click(
|
| 144 |
fn=llm_response_wrapper,
|
|
@@ -146,6 +193,6 @@ with gr.Blocks() as gradio_ui:
|
|
| 146 |
outputs=llm_response
|
| 147 |
)
|
| 148 |
|
| 149 |
-
cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
|
| 150 |
|
| 151 |
gradio_ui.launch(server_name="0.0.0.0")
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import firecrawl_client
|
| 3 |
import crawl4ai_client
|
| 4 |
import llm_inference_service
|
| 5 |
+
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
|
| 6 |
+
from langfuse import Langfuse, get_client
|
| 7 |
+
|
| 8 |
+
# Initialize Langfuse if configured
|
| 9 |
+
langfuse = None
|
| 10 |
+
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
|
| 11 |
+
Langfuse(
|
| 12 |
+
public_key=LANGFUSE_PUBLIC_KEY,
|
| 13 |
+
secret_key=LANGFUSE_SECRET_KEY,
|
| 14 |
+
host=LANGFUSE_HOST
|
| 15 |
+
)
|
| 16 |
+
langfuse = get_client()
|
| 17 |
|
| 18 |
def parse_model_provider(selection):
|
| 19 |
# Expected format: "<model_name> (<provider>)"
|
|
|
|
| 23 |
return model, provider
|
| 24 |
raise ValueError(f"Invalid selection format: {selection}")
|
| 25 |
|
| 26 |
+
def llm_response_wrapper(query, scrape_result, model_provider_selection, progress=gr.Progress(track_tqdm=True)):
|
| 27 |
+
yield "⏳ Generating response... Please wait."
|
| 28 |
+
|
| 29 |
model, provider = parse_model_provider(model_provider_selection)
|
| 30 |
result = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
|
| 31 |
if not result or (isinstance(result, str) and result.strip() == ""):
|
| 32 |
+
yield "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
|
| 33 |
+
yield result
|
| 34 |
+
|
| 35 |
+
async def scrape_website(url, scraper_selection, progress=gr.Progress(track_tqdm=True)):
|
| 36 |
+
"""
|
| 37 |
+
Performs the scraping and yields Gradio component updates directly.
|
| 38 |
+
This generator pattern is the most reliable way to handle sequential UI updates.
|
| 39 |
+
"""
|
| 40 |
+
# 1. First, yield an update to show the loading state and hide the old image.
|
| 41 |
+
yield "⏳ Scraping website... Please wait."
|
| 42 |
+
|
| 43 |
+
markdown = ""
|
| 44 |
+
if not langfuse:
|
| 45 |
+
try:
|
| 46 |
+
if scraper_selection == "Scrape with FireCrawl":
|
| 47 |
+
markdown = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
|
| 48 |
+
elif scraper_selection == "Scrape with Crawl4AI":
|
| 49 |
+
markdown = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
|
| 50 |
+
else:
|
| 51 |
+
markdown = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
|
| 52 |
+
except Exception as e:
|
| 53 |
+
markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
|
| 54 |
+
yield markdown
|
| 55 |
+
return
|
| 56 |
+
|
| 57 |
+
with langfuse.start_as_current_span(name="web-scraping", input={"url": url, "scraper": scraper_selection}) as span:
|
| 58 |
+
try:
|
| 59 |
+
if scraper_selection == "Scrape with FireCrawl":
|
| 60 |
+
markdown = firecrawl_client.scrape_and_get_markdown_with_firecrawl(url)
|
| 61 |
+
elif scraper_selection == "Scrape with Crawl4AI":
|
| 62 |
+
markdown = await crawl4ai_client.scrape_and_get_markdown_with_crawl4ai(url)
|
| 63 |
+
else:
|
| 64 |
+
markdown = "❌ <span style='color:red;'>Invalid scraper selected.</span>"
|
| 65 |
+
span.update_trace(output={"markdown_char_count": len(markdown), "status": "Success"})
|
| 66 |
+
except Exception as e:
|
| 67 |
+
markdown = f"❌ <span style='color:red;'>An unexpected error occurred: {e}</span>"
|
| 68 |
+
span.update_trace(output={"error": str(e), "status": "Error"})
|
| 69 |
+
yield markdown
|
| 70 |
|
| 71 |
#Gradio UI
|
| 72 |
with gr.Blocks() as gradio_ui:
|
|
|
|
| 137 |
value="Scrape with FireCrawl"
|
| 138 |
)
|
| 139 |
scrape_btn = gr.Button("Scrape Website")
|
| 140 |
+
clear_btn = gr.Button("Clear")
|
| 141 |
+
|
| 142 |
scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)
|
| 143 |
+
|
| 144 |
gr.HTML("<hr style='margin-top:10px; margin-bottom:10px;'>")
|
| 145 |
gr.Markdown("### 🧠 LLM Extraction")
|
| 146 |
gr.Markdown("Use a language model to extract structured information from the scraped content.")
|
|
|
|
| 170 |
|
| 171 |
# LLM response output area and loader
|
| 172 |
llm_response = gr.Markdown(
|
| 173 |
+
"",
|
| 174 |
label="LLM Response",
|
| 175 |
show_copy_button=True,
|
| 176 |
visible=True
|
|
|
|
| 178 |
# Removed custom loader; Gradio will show a spinner automatically during processing.
|
| 179 |
|
| 180 |
|
| 181 |
+
scrape_event = scrape_btn.click(
|
| 182 |
+
fn=scrape_website,
|
| 183 |
+
inputs=[url_input, scraper_dropdown],
|
| 184 |
+
outputs=[scrape_result_textbox],
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
# Clear button functionality
|
| 188 |
+
clear_btn.click(lambda: ("", "", "", ""), outputs=[url_input, query_input, scrape_result_textbox, llm_response])
|
| 189 |
|
| 190 |
llm_event = llm_response_btn.click(
|
| 191 |
fn=llm_response_wrapper,
|
|
|
|
| 193 |
outputs=llm_response
|
| 194 |
)
|
| 195 |
|
| 196 |
+
cancel_btn.click(fn=lambda: None, inputs=None, outputs=None, cancels=[scrape_event, llm_event])
|
| 197 |
|
| 198 |
gradio_ui.launch(server_name="0.0.0.0")
|
config.py
CHANGED
|
@@ -21,6 +21,9 @@ if os.path.exists(env_path):
|
|
| 21 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 22 |
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
|
| 23 |
FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
if not GOOGLE_API_KEY:
|
| 26 |
print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")
|
|
@@ -30,3 +33,6 @@ if not NVIDIA_API_KEY:
|
|
| 30 |
|
| 31 |
if not FIRE_CRAWL_API_KEY:
|
| 32 |
print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 22 |
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
|
| 23 |
FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
|
| 24 |
+
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
|
| 25 |
+
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
|
| 26 |
+
LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com") # Default to cloud
|
| 27 |
|
| 28 |
if not GOOGLE_API_KEY:
|
| 29 |
print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")
|
|
|
|
| 33 |
|
| 34 |
if not FIRE_CRAWL_API_KEY:
|
| 35 |
print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
|
| 36 |
+
|
| 37 |
+
if not LANGFUSE_PUBLIC_KEY or not LANGFUSE_SECRET_KEY:
|
| 38 |
+
print("⚠️ Warning: LANGFUSE_PUBLIC_KEY or LANGFUSE_SECRET_KEY is not set. Langfuse tracing will be disabled.")
|
crawl4ai_client.py
CHANGED
|
@@ -5,10 +5,13 @@ from crawl4ai import AsyncWebCrawler
|
|
| 5 |
async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
|
| 6 |
try:
|
| 7 |
async with AsyncWebCrawler() as crawler:
|
| 8 |
-
result = await crawler.arun(url=url)
|
|
|
|
|
|
|
|
|
|
| 9 |
if result and result.markdown:
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
return
|
| 13 |
except Exception as e:
|
| 14 |
return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
|
|
|
|
| 5 |
async def scrape_and_get_markdown_with_crawl4ai(url: str) -> str:
|
| 6 |
try:
|
| 7 |
async with AsyncWebCrawler() as crawler:
|
| 8 |
+
result = await crawler.arun(url=url)
|
| 9 |
+
|
| 10 |
+
markdown_content = "❌ <span style='color:red;'>Crawl4AI completed but returned no content. The page might be empty or inaccessible.</span>"
|
| 11 |
+
|
| 12 |
if result and result.markdown:
|
| 13 |
+
markdown_content = result.markdown
|
| 14 |
+
|
| 15 |
+
return markdown_content
|
| 16 |
except Exception as e:
|
| 17 |
return f"❌ <span style='color:red;'>An error occurred while scraping with Crawl4AI: {e}</span>"
|
docker-compose.dev.yml
CHANGED
|
@@ -12,6 +12,9 @@ services:
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
|
|
|
|
|
|
|
|
|
| 15 |
volumes:
|
| 16 |
- .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
|
| 17 |
restart: unless-stopped
|
|
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
| 15 |
+
- LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
|
| 16 |
+
- LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
|
| 17 |
+
- LANGFUSE_HOST=${LANGFUSE_HOST}
|
| 18 |
volumes:
|
| 19 |
- .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
|
| 20 |
restart: unless-stopped
|
docker-compose.yml
CHANGED
|
@@ -12,5 +12,7 @@ services:
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
|
|
|
|
|
|
|
|
|
| 15 |
restart: unless-stopped
|
| 16 |
-
|
|
|
|
| 12 |
- NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
|
| 13 |
- GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
|
| 14 |
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
|
| 15 |
+
- LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
|
| 16 |
+
- LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
|
| 17 |
+
- LANGFUSE_HOST=${LANGFUSE_HOST}
|
| 18 |
restart: unless-stopped
|
|
|
llm_inference_service.py
CHANGED
|
@@ -1,4 +1,25 @@
|
|
| 1 |
from langchain.chat_models import init_chat_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
|
|
@@ -38,6 +59,6 @@ def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, mod
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
llm = init_chat_model(model_name, model_provider=model_provider)
|
| 41 |
-
response = llm.invoke(prompt)
|
| 42 |
return response.content
|
| 43 |
|
|
|
|
| 1 |
from langchain.chat_models import init_chat_model
|
| 2 |
+
from langfuse.langchain import CallbackHandler
|
| 3 |
+
from langfuse import Langfuse
|
| 4 |
+
|
| 5 |
+
from config import LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST
|
| 6 |
+
|
| 7 |
+
# Initialize Langfuse client
|
| 8 |
+
# It is safe to do this even if keys are not set, as the handler will only be used if keys are present.
|
| 9 |
+
langfuse_callback_handler = None
|
| 10 |
+
callbacks = []
|
| 11 |
+
|
| 12 |
+
if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
|
| 13 |
+
langfuse = Langfuse(
|
| 14 |
+
public_key=LANGFUSE_PUBLIC_KEY,
|
| 15 |
+
secret_key=LANGFUSE_SECRET_KEY,
|
| 16 |
+
host=LANGFUSE_HOST,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
langfuse_callback_handler = CallbackHandler()
|
| 20 |
+
|
| 21 |
+
callbacks.append(langfuse_callback_handler)
|
| 22 |
+
|
| 23 |
|
| 24 |
|
| 25 |
def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
|
|
|
|
| 59 |
"""
|
| 60 |
|
| 61 |
llm = init_chat_model(model_name, model_provider=model_provider)
|
| 62 |
+
response = llm.invoke(prompt, config={"callbacks": callbacks})
|
| 63 |
return response.content
|
| 64 |
|
requirements.txt
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
gradio==5.46.1
|
| 2 |
requests==2.32.5
|
| 3 |
-
dotenv==
|
| 4 |
firecrawl-py==4.3.6
|
| 5 |
-
langchain-community
|
| 6 |
langchain-google-genai==2.1.12
|
| 7 |
langchain-nvidia-ai-endpoints==0.3.18
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 1 |
gradio==5.46.1
|
| 2 |
requests==2.32.5
|
| 3 |
+
python-dotenv==1.1.1
|
| 4 |
firecrawl-py==4.3.6
|
|
|
|
| 5 |
langchain-google-genai==2.1.12
|
| 6 |
langchain-nvidia-ai-endpoints==0.3.18
|
| 7 |
+
Crawl4AI==0.7.4
|
| 8 |
+
langfuse==3.5.2
|
| 9 |
+
langchain==0.3.27
|
| 10 |
+
langchain-community==0.3.30
|