#!/usr/bin/env python3
"""
Example script that demonstrates how to use the web crawler programmatically.

This example:
1. Initializes the crawler
2. Adds seed URLs
3. Starts the crawler with a configurable number of workers
4. Monitors progress for a specific duration
5. Pauses, resumes, and stops the crawler
6. Exports crawl data

Usage:
    example.py [--time=<seconds>] [--workers=<num>] [--async]

Options:
    --time=<seconds>    Duration of the crawl in seconds [default: 60]
    --workers=<num>     Number of worker threads [default: 2]
    --async             Use asynchronous mode
"""
import time
import logging
import sys
import json
import os
import signal
import threading

from docopt import docopt

from crawler import Crawler
from models import URLStatus, Priority
import config

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('example')

def log_stats(crawler, interval=5):
    """Log crawler statistics periodically."""
    stats = crawler.stats
    elapsed = time.time() - stats['start_time']

    logger.info(f"=== Crawler Statistics (after {int(elapsed)}s) ===")
    logger.info(f"Pages crawled: {stats['pages_crawled']}")
    logger.info(f"Pages failed: {stats['pages_failed']}")
    logger.info(f"URLs discovered: {stats['urls_discovered']}")
    logger.info(f"URLs filtered: {stats['urls_filtered']}")
    logger.info(f"Domains crawled: {len(stats['domains_crawled'])}")
    logger.info(f"Frontier size: {crawler.frontier.size()}")

    # Status code distribution
    status_codes = stats['status_codes']
    if status_codes:
        logger.info("Status code distribution:")
        for status, count in sorted(status_codes.items()):
            logger.info(f"  {status}: {count}")

    # Check if the crawler is still running
    if crawler.running and not crawler.stop_event.is_set():
        # Schedule the next logging run
        timer = threading.Timer(interval, log_stats, args=[crawler, interval])
        timer.daemon = True
        timer.start()

def example_crawl(duration=60, workers=2, async_mode=False):
    """
    Example crawler use.

    Args:
        duration: Duration of the crawl in seconds
        workers: Number of worker threads
        async_mode: Whether to use async mode
    """
    logger.info("Initializing web crawler...")

    # Initialize crawler
    crawler = Crawler()

    # Add seed URLs
    seed_urls = [
        'https://en.wikipedia.org/wiki/Web_crawler',
        'https://en.wikipedia.org/wiki/Search_engine',
        'https://en.wikipedia.org/wiki/Web_indexing',
        'https://python.org',
        'https://www.example.com'
    ]
    logger.info(f"Adding {len(seed_urls)} seed URLs...")
    crawler.add_seed_urls(seed_urls)

    # Set up signal handling
    def signal_handler(sig, frame):
        logger.info("Received interrupt signal, stopping crawler")
        crawler.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    # Start the crawler in a separate thread
    logger.info(f"Starting crawler with {workers} workers (async={async_mode})...")
    crawler_thread = threading.Thread(
        target=crawler.start,
        kwargs={'num_workers': workers, 'async_mode': async_mode}
    )
    crawler_thread.daemon = True
    crawler_thread.start()

    # Begin periodic stats logging once the crawler is up; log_stats reschedules
    # itself with a Timer for as long as the crawler keeps running. Give the
    # workers a moment to spin up before the first snapshot.
    time.sleep(1)
    log_stats(crawler, interval=5)
    # Let the crawler run for a while
    logger.info(f"Crawler will run for {duration} seconds...")
    time.sleep(duration // 2)

    # Pause crawler
    logger.info("Pausing crawler for 5 seconds...")
    crawler.pause()
    time.sleep(5)

    # Resume crawler
    logger.info("Resuming crawler...")
    crawler.resume()
    time.sleep(duration // 2)

    # Stop crawler
    logger.info("Stopping crawler...")
    crawler.stop()

    # Wait for crawler to stop
    crawler_thread.join(timeout=10)

    # Export crawl data
    export_dir = os.path.join(config.STORAGE_PATH, 'exports')
    os.makedirs(export_dir, exist_ok=True)
    export_file = os.path.join(export_dir, 'example_crawl_results.json')

    logger.info(f"Exporting crawl data to {export_file}...")
    export_results(crawler, export_file)

    logger.info("Crawl example completed")

    # Print summary
    print_summary(crawler)

def export_results(crawler, output_file):
    """
    Export crawler results to a file.

    Args:
        crawler: Crawler instance
        output_file: Output file path
    """
    try:
        # Get MongoDB collections
        pages_collection = crawler.db.pages_collection
        urls_collection = crawler.db.urls_collection

        # Get data
        pages = list(pages_collection.find({}, {'_id': 0}).limit(1000))
        urls = list(urls_collection.find({}, {'_id': 0}).limit(1000))

        # Prepare export data
        export_data = {
            'metadata': {
                'crawl_duration': time.time() - crawler.stats['start_time'],
                'pages_crawled': crawler.stats['pages_crawled'],
                'urls_discovered': crawler.stats['urls_discovered'],
                'domains_crawled': list(crawler.stats['domains_crawled']),
                'exported_pages': len(pages),
                'exported_urls': len(urls),
                'export_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            },
            'pages': pages,
            'urls': urls,
            'stats': crawler.stats
        }

        # Convert datetime objects to strings for JSON serialization
        export_data = json.loads(json.dumps(export_data, default=str))

        # Write to file
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)

        logger.info(f"Exported data to {output_file}")
    except Exception as e:
        logger.error(f"Error exporting results: {e}")

def print_summary(crawler):
    """
    Print a summary of the crawl.

    Args:
        crawler: Crawler instance
    """
    stats = crawler.stats

    print("\n=============== CRAWL SUMMARY ===============")
    print(f"Duration: {time.time() - stats['start_time']:.2f} seconds")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")
    print(f"Domains crawled: {len(stats['domains_crawled'])}")

    if stats['domains_crawled']:
        print("\nTop domains:")
        domain_counts = {}

        # Count pages per domain
        for page in crawler.db.pages_collection.find({}, {'domain': 1}):
            domain = page.get('domain', 'unknown')
            domain_counts[domain] = domain_counts.get(domain, 0) + 1

        # Display top domains
        for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {domain}: {count} pages")

    print("\nHTTP Status Codes:")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")

    print("\nContent Types:")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {content_type}: {count}")

    print("=============================================\n")

if __name__ == '__main__':
    # Parse command-line arguments
    args = docopt(__doc__)

    duration = int(args['--time'])
    workers = int(args['--workers'])
    async_mode = args['--async']

    try:
        example_crawl(duration, workers, async_mode)
    except KeyboardInterrupt:
        logger.info("Example interrupted by user")
    except Exception as e:
        logger.error(f"Error in example: {e}")
        logger.exception(e)