| #!/usr/bin/env python3 | |
| """ | |
| Test downloading files from URLs | |
| """ | |
| import requests | |
| import pandas as pd | |
| import PyPDF2 | |
| from io import BytesIO | |
def test_file_download():
    """Smoke-test downloading an Excel file and a PDF, then opening each one.

    For every entry in a hard-coded list of example URLs, the URL is fetched
    with ``requests.get`` (10 second timeout). When the response status is
    200 the body is parsed entirely in memory:

    * ``"excel"`` entries are loaded with ``pandas.read_excel`` and the
      resulting frame's shape and column names are printed;
    * ``"pdf"`` entries are opened with ``PyPDF2.PdfReader`` and the page
      count is printed.

    Any other status code is reported as a failed download. An exception
    raised while downloading or parsing one entry is printed and the loop
    carries on with the next entry, so one bad URL does not abort the run.

    The function takes no arguments, reports only through ``print`` and
    returns ``None``.
    """
    # Example URLs (these are hypothetical). The "question" field is carried
    # along with each entry but is not read anywhere in this function.
    test_urls = [
        {
            "url": "https://example.com/sales_data.xlsx",
            "type": "excel",
            "question": "What is the total sales from the Excel file at https://example.com/sales_data.xlsx?",
        },
        {
            "url": "https://example.com/document.pdf",
            "type": "pdf",
            "question": "How many times does 'therefore' appear in https://example.com/document.pdf?",
        },
    ]

    for test in test_urls:
        print(f"\nTesting {test['type']} download:")
        print(f"URL: {test['url']}")

        try:
            # Download the file; the timeout keeps a dead host from hanging
            # the whole run.
            response = requests.get(test['url'], timeout=10)

            if response.status_code != 200:
                # Status markers are plain ASCII: the previous symbol had been
                # corrupted by an encoding round-trip, and ASCII cannot fail
                # to encode on a limited console.
                print(f"[FAIL] Failed to download: {response.status_code}")
                continue

            print("[OK] File downloaded successfully")

            # Both parsers accept a file-like object, so the body is wrapped
            # once and never written to disk.
            payload = BytesIO(response.content)

            # Process based on file type
            if test['type'] == 'excel':
                df = pd.read_excel(payload)
                print(f"Excel shape: {df.shape}")
                print(f"Columns: {list(df.columns)}")
            elif test['type'] == 'pdf':
                pdf_reader = PyPDF2.PdfReader(payload)
                print(f"PDF pages: {len(pdf_reader.pages)}")
        except Exception as e:
            # Deliberately broad: this is a best-effort smoke test, and the
            # network and parser libraries each raise their own exception
            # types. Report the problem and move on to the next URL.
            print(f"[FAIL] Error: {e}")
# Script entry point: run the download smoke test only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    test_file_download()