Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Script to remove all web crawler data from MongoDB. Prompts for confirmation unless run with --force (non-interactive mode). | |
| """ | |
| import logging | |
| from pymongo import MongoClient | |
| import sys | |
# Configure root logging once at import time so every record carries a
# timestamp, the emitting logger's name, and the severity level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)

# Module-wide logger used by the cleanup routine below.
logger = logging.getLogger("mongo_cleanup")
def cleanup_mongodb(
    mongo_uri: str = "mongodb://localhost:27017/",
    db_name: str = "crawler",
) -> bool:
    """Remove all web crawler data from MongoDB.

    Drops every collection in the crawler database, then the database
    itself, and finally drops crawler-style collections (``urls``,
    ``pages``, ``domains``, ``stats``) from any other non-system database
    that contains a ``urls`` or ``pages`` collection.

    This function never prompts; the caller is responsible for obtaining
    confirmation before invoking it.

    Args:
        mongo_uri: Connection string of the MongoDB deployment to clean.
        db_name: Name of the crawler database to drop.

    Returns:
        True if the cleanup finished without error, False otherwise (the
        error is logged together with its traceback).
    """
    system_dbs = {"admin", "config", "local"}
    # A database is only treated as crawler-related if it holds one of these.
    marker_collections = {"urls", "pages"}
    # Collections removed from a database once it is treated as related.
    related_collections = ("urls", "pages", "domains", "stats")

    try:
        logger.info("Connecting to MongoDB...")
        # The context manager closes the client (sockets and monitor
        # threads) even when one of the operations below raises.
        with MongoClient(mongo_uri) as client:
            crawler_db = client[db_name]

            # Drop collections one by one so each removal is logged.
            collections = crawler_db.list_collection_names()
            if not collections:
                logger.info("No collections found in the %s database", db_name)
            else:
                logger.info(
                    "Found %d collections to drop: %s",
                    len(collections),
                    collections,
                )
                for collection in collections:
                    logger.info("Dropping collection: %s", collection)
                    crawler_db[collection].drop()
                logger.info("All %s collections dropped successfully", db_name)

            # Remove the (now empty) database itself as well.
            logger.info("Dropping entire %s database", db_name)
            client.drop_database(db_name)

            # Sweep the remaining user databases for crawler-style collections.
            for other_name in client.list_database_names():
                if other_name in system_dbs:
                    continue
                other_db = client[other_name]
                # One snapshot per database instead of re-querying the server
                # for every candidate collection name.
                existing = set(other_db.list_collection_names())
                if not existing & marker_collections:
                    continue
                logger.info(
                    "Found crawler-related collections in database: %s",
                    other_name,
                )
                # No confirmation is requested here; these are dropped
                # unconditionally once a marker collection is present.
                for collection in related_collections:
                    if collection in existing:
                        logger.info(
                            "Dropping collection %s.%s", other_name, collection
                        )
                        other_db[collection].drop()

        logger.info("MongoDB cleanup completed successfully")
        return True
    except Exception as e:
        # Top-level boundary: report failure through the return value, but
        # keep the traceback in the log for diagnosis.
        logger.exception(f"Error cleaning up MongoDB: {e}")
        return False
def main(argv=None):
    """Run the cleanup from the command line and return a process exit code.

    With ``--force`` as the first argument the cleanup runs without
    prompting. Otherwise the user is asked to confirm; any answer other
    than ``y`` (case-insensitive, surrounding whitespace ignored) cancels.

    Args:
        argv: Command-line arguments excluding the program name. Defaults
            to ``sys.argv[1:]``.

    Returns:
        0 when the cleanup succeeded or was cancelled by the user, 1 when
        it failed or no confirmation could be read.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    print("MongoDB Crawler Data Cleanup")
    print("--------------------------")
    print("This script will remove all web crawler collections from MongoDB")
    print()

    force = bool(args) and args[0] == '--force'
    if not force:
        # Interactive mode: require explicit confirmation.
        try:
            proceed = input("Do you want to proceed with MongoDB cleanup? (y/n): ")
        except EOFError:
            # stdin is closed (e.g. run from a pipeline without --force).
            print("No input available; use --force for non-interactive runs")
            return 1
        if proceed.strip().lower() != 'y':
            print("Cleanup cancelled")
            return 0

    success = cleanup_mongodb()
    if not force:
        print(f"\nMongoDB cleanup: {'Completed' if success else 'Failed'}")
    # Report failure through the exit status in both modes.
    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())