from spider.spider import Spider

from langflow.base.langchain_utilities.spider_constants import MODES
from langflow.custom import Component
from langflow.io import (
    BoolInput,
    DictInput,
    DropdownInput,
    IntInput,
    Output,
    SecretStrInput,
    StrInput,
)
from langflow.schema import Data


class SpiderTool(Component):
    display_name: str = "Spider Web Crawler & Scraper"
    description: str = "Spider API for web crawling and scraping."
    output_types: list[str] = ["Document"]
    documentation: str = "https://spider.cloud/docs/api"

    inputs = [
        SecretStrInput(
            name="spider_api_key",
            display_name="Spider API Key",
            required=True,
            password=True,
            info="The Spider API Key, get it from https://spider.cloud",
        ),
        StrInput(
            name="url",
            display_name="URL",
            required=True,
            info="The URL to scrape or crawl",
        ),
        DropdownInput(
            name="mode",
            display_name="Mode",
            required=True,
            options=MODES,
            value=MODES[0],
            info="The mode of operation: scrape or crawl",
        ),
        IntInput(
            name="limit",
            display_name="Limit",
            info="The maximum number of pages to crawl per website. Set to 0 to crawl all pages.",
            advanced=True,
        ),
        IntInput(
            name="depth",
            display_name="Depth",
            info="The maximum crawl depth. If 0, no limit is applied.",
            advanced=True,
        ),
        StrInput(
            name="blacklist",
            display_name="Blacklist",
            info="Regex patterns for paths that should not be crawled.",
            advanced=True,
        ),
        StrInput(
            name="whitelist",
            display_name="Whitelist",
            info="Regex patterns for the only paths to crawl; all other routes are ignored.",
            advanced=True,
        ),
        BoolInput(
            name="readability",
            display_name="Use Readability",
            info="Use readability to pre-process the content for reading.",
            advanced=True,
        ),
        IntInput(
            name="request_timeout",
            display_name="Request Timeout",
            info="Timeout for the request in seconds.",
            advanced=True,
        ),
        BoolInput(
            name="metadata",
            display_name="Metadata",
            info="Include metadata in the response.",
            advanced=True,
        ),
        DictInput(
            name="params",
            display_name="Additional Parameters",
            info="Additional parameters to pass to the API. If provided, all other inputs are ignored.",
        ),
    ]

    outputs = [
        Output(display_name="Markdown", name="content", method="crawl"),
    ]

    def crawl(self) -> list[Data]:
        """Scrape or crawl the configured URL and return the results as Data records."""
        # An explicit `params` dict overrides every other input.
        if self.params:
            parameters = self.params["data"]
        else:
            # Unset numeric/string inputs fall back to None so the API
            # applies its own defaults.
            parameters = {
                "limit": self.limit or None,
                "depth": self.depth or None,
                "blacklist": self.blacklist or None,
                "whitelist": self.whitelist or None,
                "readability": self.readability,
                "request_timeout": self.request_timeout or None,
                "metadata": self.metadata,
                "return_format": "markdown",
            }

        app = Spider(api_key=self.spider_api_key)
        if self.mode == "scrape":
            # Scraping targets a single page, so cap the limit at 1.
            parameters["limit"] = 1
            result = app.scrape_url(self.url, parameters)
        elif self.mode == "crawl":
            result = app.crawl_url(self.url, parameters)
        else:
            msg = f"Invalid mode: {self.mode}. Must be 'scrape' or 'crawl'."
            raise ValueError(msg)

        # Wrap each returned page in a Data record, attaching metadata only
        # when it was requested.
        records = []
        for record in result:
            if self.metadata:
                records.append(
                    Data(
                        data={
                            "content": record["content"],
                            "url": record["url"],
                            "metadata": record["metadata"],
                        }
                    )
                )
            else:
                records.append(Data(data={"content": record["content"], "url": record["url"]}))
        return records


class SpiderToolError(Exception):
    """SpiderTool error."""