| """ | |
| Configuration settings for the web crawler | |
| """ | |
| import os | |
| from typing import Dict, List, Any, Optional | |
| # General settings | |
| MAX_WORKERS = 100 # Maximum number of worker threads/processes | |
| MAX_DEPTH = 10 # Maximum depth to crawl from seed URLs | |
| CRAWL_TIMEOUT = 30 # Timeout for HTTP requests in seconds | |
| USER_AGENT = "Mozilla/5.0 WebCrawler/1.0 (+https://example.org/bot)" | |
# Politeness settings
ROBOTSTXT_OBEY = True  # Whether to obey robots.txt rules
DOWNLOAD_DELAY = 1.0  # Delay between requests to the same domain (seconds)
MAX_REQUESTS_PER_DOMAIN = 10  # Maximum concurrent requests per domain
RESPECT_CRAWL_DELAY = True  # Respect Crawl-delay in robots.txt
RETRY_TIMES = 3  # Number of retries for failed requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]  # HTTP codes to retry
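
# A minimal sketch of how the retry settings above might be applied, using
# only the standard library; fetch_with_retries and its linear backoff are
# assumptions, not part of the crawler's actual fetch path.
def fetch_with_retries(url: str) -> bytes:
    """Fetch `url`, retrying on the transient HTTP codes listed above."""
    import time
    import urllib.error
    import urllib.request

    last_error: Optional[urllib.error.HTTPError] = None
    for attempt in range(RETRY_TIMES + 1):
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        try:
            with urllib.request.urlopen(request, timeout=CRAWL_TIMEOUT) as response:
                return response.read()
        except urllib.error.HTTPError as exc:
            if exc.code not in RETRY_HTTP_CODES:
                raise  # non-transient error: fail immediately
            last_error = exc
            time.sleep(DOWNLOAD_DELAY * (attempt + 1))  # linear backoff
    raise last_error
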
# URL settings
ALLOWED_DOMAINS: Optional[List[str]] = None  # Domains to restrict crawling to (None = all domains)
EXCLUDED_DOMAINS: List[str] = []  # Domains to exclude from crawling
ALLOWED_SCHEMES = ["http", "https"]  # URL schemes to allow
URL_FILTERS = [
    # Only filter out binary and media files
    r".*\.(jpg|jpeg|gif|png|ico|mp3|mp4|wav|avi|mov|mpeg|pdf|zip|rar|gz|exe|dmg|pkg|iso|bin)$",
]  # Regex patterns to filter out URLs
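
# A minimal sketch of applying the URL settings above; is_url_allowed is a
# hypothetical helper and assumes exact hostname matches (no subdomain logic).
def is_url_allowed(url: str) -> bool:
    """Return True if `url` passes the scheme, domain, and pattern filters."""
    import re
    from urllib.parse import urlparse

    parsed = urlparse(url)
    if parsed.scheme not in ALLOWED_SCHEMES:
        return False
    host = parsed.hostname or ""
    if ALLOWED_DOMAINS is not None and host not in ALLOWED_DOMAINS:
        return False
    if host in EXCLUDED_DOMAINS:
        return False
    # Reject anything matching one of the exclusion patterns
    return not any(re.match(pattern, url) for pattern in URL_FILTERS)
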
# Storage settings
MONGODB_URI = "mongodb://localhost:27017/"
MONGODB_DB = "webcrawler"
REDIS_URI = "redis://localhost:6379/0"
STORAGE_PATH = os.path.join(os.path.dirname(__file__), "storage")
HTML_STORAGE_PATH = os.path.join(STORAGE_PATH, "html")
LOG_PATH = os.path.join(STORAGE_PATH, "logs")
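
# The storage directories are assumed to exist before the crawler writes to
# them; a small hypothetical helper that creates them idempotently:
def ensure_storage_dirs() -> None:
    """Create the storage, HTML, and log directories if they are missing."""
    for path in (STORAGE_PATH, HTML_STORAGE_PATH, LOG_PATH):
        os.makedirs(path, exist_ok=True)
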
# Frontier settings
FRONTIER_QUEUE_SIZE = 100000  # Maximum number of URLs in the frontier queue
PRIORITY_QUEUE_NUM = 5  # Number of priority queues
HOST_QUEUE_NUM = 1000  # Number of host queues for politeness
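
# One common frontier design (e.g. Mercator-style) maps each host to a fixed
# back queue so per-host politeness is easy to enforce. A sketch of that
# mapping; host_queue_index is a hypothetical helper. A stable digest (md5)
# is used because Python's built-in hash() is salted per process.
def host_queue_index(host: str) -> int:
    """Map a hostname to one of the HOST_QUEUE_NUM politeness queues."""
    import hashlib
    digest = hashlib.md5(host.encode("utf-8")).hexdigest()
    return int(digest, 16) % HOST_QUEUE_NUM
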
# Content settings
MAX_CONTENT_SIZE = 10 * 1024 * 1024  # Maximum size of HTML content to download (10MB)
ALLOWED_CONTENT_TYPES = [
    "text/html",
    "application/xhtml+xml",
    "text/plain",  # Some servers serve HTML as text/plain
    "application/html",
    "*/*",  # Some servers send a literal */*; matched literally, not as a wildcard
]  # Allowed content types
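
# Sketch of filtering a response by these settings; is_content_allowed is a
# hypothetical helper that matches the list above literally (no wildcards).
def is_content_allowed(content_type: str, content_length: int) -> bool:
    """Check a response's Content-Type and size against the limits above."""
    # Strip parameters such as "; charset=utf-8" before comparing
    mime = content_type.split(";")[0].strip().lower()
    return mime in ALLOWED_CONTENT_TYPES and content_length <= MAX_CONTENT_SIZE
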
# DNS settings
DNS_CACHE_SIZE = 10000  # Maximum number of entries in DNS cache
DNS_CACHE_TIMEOUT = 3600  # DNS cache timeout in seconds
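
# A minimal sketch of a TTL-bounded DNS cache over the standard resolver;
# resolve_host is a hypothetical helper and is not thread-safe as written.
def resolve_host(host: str, _cache: Dict[str, Any] = {}) -> str:
    """Resolve `host` to an IPv4 address, caching results for the TTL."""
    import socket
    import time

    now = time.monotonic()
    cached = _cache.get(host)
    if cached is not None and now - cached[1] < DNS_CACHE_TIMEOUT:
        return cached[0]
    if len(_cache) >= DNS_CACHE_SIZE:
        _cache.clear()  # crude eviction; a real cache would evict per entry
    address = socket.gethostbyname(host)
    _cache[host] = (address, now)
    return address
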
# Logging settings
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
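
# Sketch of wiring these settings into the standard logging module;
# configure_logging and the "crawler.log" filename are assumptions.
def configure_logging() -> None:
    """Configure root logging from LOG_LEVEL, LOG_FORMAT, and LOG_PATH."""
    import logging

    os.makedirs(LOG_PATH, exist_ok=True)
    logging.basicConfig(
        level=getattr(logging, LOG_LEVEL, logging.INFO),
        format=LOG_FORMAT,
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(os.path.join(LOG_PATH, "crawler.log")),
        ],
    )
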
# Seed URLs
SEED_URLS = [
    "https://en.wikipedia.org/",
    "https://www.nytimes.com/",
    "https://www.bbc.com/",
    "https://www.github.com/",
    "https://www.reddit.com/",
]
# Override settings with environment variables
def get_env_settings() -> Dict[str, Any]:
    """Collect overrides from WEBCRAWLER_-prefixed environment variables."""
    env_settings = {}
    # Snapshot the items so the dictionary cannot change size mid-iteration
    for key, value in list(globals().items()):
        if not key.isupper():  # Only uppercase module variables are settings
            continue
        env_value = os.environ.get(f"WEBCRAWLER_{key}")
        if env_value is None:
            continue
        # Convert to the appropriate type based on the default value.
        # bool is checked before int because bool is a subclass of int.
        if isinstance(value, bool):
            env_settings[key] = env_value.lower() in ("true", "1", "yes")
        elif isinstance(value, int):
            env_settings[key] = int(env_value)
        elif isinstance(value, float):
            env_settings[key] = float(env_value)
        elif isinstance(value, list) or value is None:
            # Lists, and None defaults such as ALLOWED_DOMAINS, are parsed
            # as comma-separated values
            env_settings[key] = [item.strip() for item in env_value.split(",")]
        else:
            env_settings[key] = env_value
    return env_settings
# Update module-level settings with any environment overrides
globals().update(get_env_settings())
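
# Example override, assuming the WEBCRAWLER_ prefix convention above
# (`crawl.py` is a hypothetical entry point):
#   $ WEBCRAWLER_MAX_WORKERS=50 WEBCRAWLER_ROBOTSTXT_OBEY=false python crawl.py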