import os
import sys
import asyncio
import shutil
import tempfile
from typing import List

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler
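
# NOTE: These tests hit the live python.org downloads page over the network,
# so pass/fail can vary with connectivity and with the page still exposing
# '.exe' links; the selectors below are assumptions about its current markup.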


class TestDownloads:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_")
        self.download_dir = os.path.join(self.temp_dir, "downloads")
        os.makedirs(self.download_dir, exist_ok=True)
        self.results: List[str] = []

    def cleanup(self):
        shutil.rmtree(self.temp_dir)

    def log_result(self, test_name: str, success: bool, message: str = ""):
        result = f"{'✅' if success else '❌'} {test_name}: {message}"
        self.results.append(result)
        print(result)

    async def test_basic_download(self):
        """Test basic file download functionality"""
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                verbose=True
            ) as crawler:
                # Python.org downloads page typically has stable download links
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                        // Click first download link
                        const downloadLink = document.querySelector('a[href$=".exe"]');
                        if (downloadLink) downloadLink.click();
                    """
                )
                success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                self.log_result(
                    "Basic Download",
                    success,
                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
                )
        except Exception as e:
            self.log_result("Basic Download", False, str(e))
    async def test_persistent_context_download(self):
        """Test downloads with persistent context"""
        try:
            user_data_dir = os.path.join(self.temp_dir, "user_data")
            os.makedirs(user_data_dir, exist_ok=True)
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                use_persistent_context=True,
                user_data_dir=user_data_dir,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                        const downloadLink = document.querySelector('a[href$=".exe"]');
                        if (downloadLink) downloadLink.click();
                    """
                )
                success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                self.log_result(
                    "Persistent Context Download",
                    success,
                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
                )
        except Exception as e:
            self.log_result("Persistent Context Download", False, str(e))
    async def test_multiple_downloads(self):
        """Test multiple simultaneous downloads"""
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                        // Click multiple download links
                        const downloadLinks = document.querySelectorAll('a[href$=".exe"]');
                        downloadLinks.forEach(link => link.click());
                    """
                )
                success = result.downloaded_files is not None and len(result.downloaded_files) > 1
                self.log_result(
                    "Multiple Downloads",
                    success,
                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded"
                )
        except Exception as e:
            self.log_result("Multiple Downloads", False, str(e))
    async def test_different_browsers(self):
        """Test downloads across different browser types"""
        browsers = ["chromium", "firefox", "webkit"]
        for browser_type in browsers:
            try:
                async with AsyncWebCrawler(
                    accept_downloads=True,
                    downloads_path=self.download_dir,
                    browser_type=browser_type,
                    verbose=True
                ) as crawler:
                    result = await crawler.arun(
                        url="https://www.python.org/downloads/",
                        js_code="""
                            const downloadLink = document.querySelector('a[href$=".exe"]');
                            if (downloadLink) downloadLink.click();
                        """
                    )
                    success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                    self.log_result(
                        f"{browser_type.title()} Download",
                        success,
                        f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
                    )
            except Exception as e:
                self.log_result(f"{browser_type.title()} Download", False, str(e))
    async def test_edge_cases(self):
        """Test various edge cases"""
        # Test 1: Downloads without specifying download path
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
                )
                self.log_result(
                    "Default Download Path",
                    True,
                    f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}"
                )
        except Exception as e:
            self.log_result("Default Download Path", False, str(e))

        # Test 2: Downloads with invalid path
        try:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path="/invalid/path/that/doesnt/exist",
                verbose=True
            ) as crawler:
                await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
                )
                self.log_result("Invalid Download Path", False, "Should have raised an error")
        except Exception:
            self.log_result("Invalid Download Path", True, "Correctly handled invalid path")

        # Test 3: Download with accept_downloads=False
        try:
            async with AsyncWebCrawler(
                accept_downloads=False,
                verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
                )
                success = result.downloaded_files is None
                self.log_result(
                    "Disabled Downloads",
                    success,
                    "Correctly ignored downloads" if success else "Unexpectedly downloaded files"
                )
        except Exception as e:
            self.log_result("Disabled Downloads", False, str(e))

    async def run_all_tests(self):
        """Run all test cases"""
        print("\n🧪 Running Download Tests...\n")
        test_methods = [
            self.test_basic_download,
            self.test_persistent_context_download,
            self.test_multiple_downloads,
            self.test_different_browsers,
            self.test_edge_cases
        ]
        for test in test_methods:
            print(f"\n🔍 Running {test.__doc__}...")
            await test()
            await asyncio.sleep(2)  # Brief pause between tests

        print("\n📊 Test Results Summary:")
        for result in self.results:
            print(result)

        successes = len([r for r in self.results if '✅' in r])
        total = len(self.results)
        print(f"\nTotal: {successes}/{total} tests passed")

        self.cleanup()


async def main():
    tester = TestDownloads()
    await tester.run_all_tests()


if __name__ == "__main__":
    asyncio.run(main())