Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Test script for the web crawler - tests only the URL frontier and downloader
without requiring MongoDB
"""
import os
import sys
import time
import logging
import threading
from urllib.parse import urlparse

import redis

# Make sure we're in the right directory so relative paths (log file,
# project modules) resolve regardless of where the script is launched from.
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Set up logging: mirror everything to stdout and to a log file next to
# this script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'test_crawler.log')),
    ],
)
logger = logging.getLogger("test_crawler")

# Import our modules (after chdir, so they are found relative to this script)
import config
from frontier import URLFrontier
from models import URL, Priority, URLStatus
from downloader import HTMLDownloader
from parser import HTMLParser
from robots import RobotsHandler
from dns_resolver import DNSResolver

# Import local configuration if available
try:
    import local_config
    # Override config settings with local settings; only UPPER_CASE names
    # are treated as configuration constants.
    for key in dir(local_config):
        if key.isupper():
            setattr(config, key, getattr(local_config, key))
    logger.info("Loaded local configuration")
except ImportError:
    logger.warning("No local_config.py found - using default config")
def test_redis():
    """Check that the Redis server at config.REDIS_URI answers a PING.

    Returns True on success, False on any connection error.
    """
    try:
        logger.info(f"Testing Redis connection to {config.REDIS_URI}")
        client = redis.from_url(config.REDIS_URI)
        client.ping()
        logger.info("Redis connection successful")
        return True
    except Exception as e:
        logger.error(f"Redis connection failed: {e}")
        return False
def test_robots_txt():
    """Exercise RobotsHandler.can_fetch against a few known sites.

    Returns True if every lookup completes without raising, else False.
    """
    try:
        logger.info("Testing robots.txt handling")
        handler = RobotsHandler()
        targets = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]
        for target in targets:
            logger.info(f"Checking robots.txt for {target}")
            allowed, crawl_delay = handler.can_fetch(target)
            logger.info(f"  Allowed: {allowed}, Crawl delay: {crawl_delay}")
        return True
    except Exception as e:
        logger.error(f"Error testing robots.txt: {e}")
        return False
def test_dns_resolver():
    """Resolve a handful of well-known domains through DNSResolver.

    Returns True if every resolution completes without raising, else False.
    """
    try:
        logger.info("Testing DNS resolver")
        resolver = DNSResolver()
        for domain in ("www.google.com", "www.github.com", "example.com"):
            logger.info(f"Resolving {domain}")
            address = resolver.resolve(f"https://{domain}/")
            logger.info(f"  IP: {address}")
        return True
    except Exception as e:
        logger.error(f"Error testing DNS resolver: {e}")
        return False
def test_url_frontier():
    """Smoke-test the URL frontier: clear, enqueue URLs, check size, pop one.

    Returns True if all operations complete without raising, else False.
    """
    try:
        logger.info("Testing URL frontier")
        frontier = URLFrontier()
        # Start from a clean state so the size check below is meaningful
        frontier.clear()
        # Add some URLs
        test_urls = [
            "https://www.google.com/",
            "https://www.github.com/",
            "https://sagarnildas.com/",
        ]
        # Fixed: dropped unused enumerate() index
        for url in test_urls:
            url_obj = URL(
                url=url,
                priority=Priority.MEDIUM,
                status=URLStatus.PENDING,
                depth=0,
            )
            added = frontier.add_url(url_obj)
            logger.info(f"Added {url}: {added}")
        # Check size
        size = frontier.size()
        logger.info(f"Frontier size: {size}")
        # Get next URL
        url = frontier.get_next_url()
        if url:
            logger.info(f"Next URL: {url.url} (priority: {url.priority})")
        else:
            logger.info("No URL available")
        return True
    except Exception as e:
        logger.error(f"Error testing URL frontier: {e}")
        return False
def test_downloader():
    """Download a couple of pages and feed successful ones to HTMLParser.

    Returns True if all downloads/parses complete without raising, else False.
    """
    try:
        logger.info("Testing HTML downloader")
        downloader = HTMLDownloader()
        targets = [
            URL(url="https://sagarnildas.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
            URL(url="https://www.google.com/", priority=Priority.MEDIUM, status=URLStatus.PENDING, depth=0),
        ]
        for url_obj in targets:
            logger.info(f"Downloading {url_obj.url}")
            page = downloader.download(url_obj)
            if not page:
                logger.info(f"  Download failed: {url_obj.error}")
                continue
            logger.info(f"  Downloaded {page.content_length} bytes, status: {page.status_code}")
            # Test parsing on the fetched page
            parser = HTMLParser()
            urls, metadata = parser.parse(page)
            logger.info(f"  Extracted {len(urls)} URLs and {len(metadata)} metadata items")
        return True
    except Exception as e:
        logger.error(f"Error testing HTML downloader: {e}")
        return False
def run_tests():
    """Run every component test, timing each, then log a summary.

    Returns True only if every individual test succeeded.
    """
    logger.info("Starting crawler component tests")
    tests = [
        ("Redis", test_redis),
        ("Robots.txt", test_robots_txt),
        ("DNS Resolver", test_dns_resolver),
        ("URL Frontier", test_url_frontier),
        ("HTML Downloader", test_downloader),
    ]
    results = []
    for name, test_func in tests:
        logger.info(f"\n=== Testing {name} ===")
        start_time = time.time()
        success = test_func()
        elapsed = time.time() - start_time
        results.append({"name": name, "success": success, "time": elapsed})
        logger.info(f"=== {name} test {'succeeded' if success else 'failed'} in {elapsed:.2f}s ===\n")
    # Print summary
    logger.info("\n=== Test Summary ===")
    all_success = all(r["success"] for r in results)
    for r in results:
        status = "SUCCESS" if r["success"] else "FAILED"
        logger.info(f"{r['name']}: {status} ({r['time']:.2f}s)")
    if all_success:
        logger.info("All tests passed!")
    else:
        logger.warning("Some tests failed. Check logs for details.")
    return all_success
| if __name__ == "__main__": | |
| run_tests() |