"""
This module provides a client for interacting with the FireCrawl service.
It encapsulates the logic for scraping a website using the FireCrawlLoader from
LangChain, converting the scraped documents into a single markdown string, and
handling potential errors during the process.
"""
from langchain_community.document_loaders import FireCrawlLoader
from langchain_core.documents import Document
from config import FIRE_CRAWL_API_KEY
def scrape_with_firecrawl(url: str) -> list[Document]:
    """
    Fetch the content at ``url`` through FireCrawl and collect it as Documents.

    A ``FireCrawlLoader`` is configured with the module-level API key and
    ``mode='scrape'``, and everything it lazily yields is gathered into a list.
    No exceptions are handled here; anything raised by the loader propagates
    to the caller.

    Args:
        url (str): The URL of the website to scrape.

    Returns:
        list[Document]: The LangChain Document objects yielded by the loader,
            in the order they were produced. The list is empty if the loader
            yields nothing.
    """
    firecrawl_loader = FireCrawlLoader(
        url=url,
        api_key=FIRE_CRAWL_API_KEY,
        mode="scrape",
    )
    # Drain the lazy iterator in one step instead of appending item by item.
    return list(firecrawl_loader.lazy_load())  # type: ignore
def get_markdown_from_documents(docs: list[Document]) -> str:
    """
    Converts a list of LangChain Documents into a single markdown string.

    Each document is rendered as a ``### Page N`` heading (N starts at 1)
    followed by the document's ``page_content`` and a ``--------------``
    horizontal rule. The rule is emitted after every document, including
    the last one, so a non-empty result always ends with the rule and a
    blank line.

    Args:
        docs (list[Document]): A list of Document objects to process. Only the
            ``page_content`` attribute of each item is read.

    Returns:
        str: A string containing the combined content in markdown format, or
            an empty string when ``docs`` is empty.
    """
    # Build all sections and join once: repeated ``+=`` on a growing string
    # can be quadratic and relies on a CPython-specific optimisation.
    return "".join(
        f"### Page {page_number}\n{doc.page_content}\n\n--------------\n\n"
        for page_number, doc in enumerate(docs, start=1)
    )
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
    """
    Scrape ``url`` with FireCrawl and return the result as a markdown string.

    This is the module's main entry point. It calls ``scrape_with_firecrawl``
    and, when documents come back, hands them to
    ``get_markdown_from_documents``. This function does not raise: any
    ``Exception`` from either step is caught and reported in the returned
    string instead.

    Args:
        url (str): The URL of the website to scrape.

    Returns:
        str: The scraped content in markdown format. If the scrape yields no
            documents, or an exception occurs, a message string beginning
            with "❌" describing the problem is returned instead.
    """
    try:
        scraped_pages = scrape_with_firecrawl(url)
        if scraped_pages:
            # Conversion stays inside the ``try`` so its failures are also
            # turned into an error message.
            return get_markdown_from_documents(scraped_pages)
        return "❌ FireCrawl completed but returned no content. The page might be empty or inaccessible."
    except Exception as exc:
        return f"❌ An error occurred while scraping with FireCrawl: {exc}"