Spaces:
Runtime error
Runtime error
| import asyncio | |
| from medrag_multi_modal.document_loader.image_loader import ( | |
| FitzPILImageLoader, | |
| PDF2ImageLoader, | |
| PDFPlumberImageLoader, | |
| PyMuPDFImageLoader, | |
| ) | |
# Remote PDF that every loader test in this module downloads from.
# The literal is split only for line length; adjacent string literals are
# joined at compile time, so the resulting value is a single URL string.
URL = (
    "https://archive.org/download/GraysAnatomy41E2015PDF/"
    "Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
)

# Column names, in order, that each test compares against
# ``dataset.column_names``. Kept as a list because that comparison uses ``==``.
COLUMN_NAMES = [
    "page_image",
    "page_figure_images",
    "document_name",
    "page_idx",
]
def test_fitzpil_img_loader():
    """Check the dataset produced by ``FitzPILImageLoader`` for a page range.

    Loads the document at ``URL`` with ``start_page=32`` and ``end_page=37``
    and expects a dataset with five rows whose columns equal
    ``COLUMN_NAMES``.
    """
    loader = FitzPILImageLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )
    # load_data is a coroutine; asyncio.run drives it to completion here.
    # It is deliberately kept outside the try block so that behaviour on a
    # failed load is unchanged (no cleanup call is attempted in that case).
    dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
    try:
        assert dataset.num_rows == 5
        assert dataset.column_names == COLUMN_NAMES
    finally:
        # Run the cleanup even when an assertion above fails; previously a
        # failing assertion skipped this call entirely.
        loader.cleanup_image_dir()
def test_pdf2image_img_loader():
    """Check the dataset produced by ``PDF2ImageLoader`` for a page range.

    Loads the document at ``URL`` with ``start_page=32`` and ``end_page=37``
    and expects a dataset with five rows whose columns equal
    ``COLUMN_NAMES``.
    """
    loader = PDF2ImageLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )
    # load_data is a coroutine; asyncio.run drives it to completion here.
    # It is deliberately kept outside the try block so that behaviour on a
    # failed load is unchanged (no cleanup call is attempted in that case).
    dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
    try:
        assert dataset.num_rows == 5
        assert dataset.column_names == COLUMN_NAMES
    finally:
        # Run the cleanup even when an assertion above fails; previously a
        # failing assertion skipped this call entirely.
        loader.cleanup_image_dir()
def test_pdfplumber_img_loader():
    """Check the dataset produced by ``PDFPlumberImageLoader`` for a page range.

    Loads the document at ``URL`` with ``start_page=32`` and ``end_page=37``
    and expects a dataset with five rows whose columns equal
    ``COLUMN_NAMES``.
    """
    loader = PDFPlumberImageLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )
    # load_data is a coroutine; asyncio.run drives it to completion here.
    # It is deliberately kept outside the try block so that behaviour on a
    # failed load is unchanged (no cleanup call is attempted in that case).
    dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
    try:
        assert dataset.num_rows == 5
        assert dataset.column_names == COLUMN_NAMES
    finally:
        # Run the cleanup even when an assertion above fails; previously a
        # failing assertion skipped this call entirely.
        loader.cleanup_image_dir()
def test_pymupdf_img_loader():
    """Check the dataset produced by ``PyMuPDFImageLoader`` for a page range.

    Loads the document at ``URL`` with ``start_page=32`` and ``end_page=37``
    and expects a dataset with five rows whose columns equal
    ``COLUMN_NAMES``.
    """
    loader = PyMuPDFImageLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )
    # load_data is a coroutine; asyncio.run drives it to completion here.
    # It is deliberately kept outside the try block so that behaviour on a
    # failed load is unchanged (no cleanup call is attempted in that case).
    dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
    try:
        assert dataset.num_rows == 5
        assert dataset.column_names == COLUMN_NAMES
    finally:
        # Run the cleanup even when an assertion above fails; previously a
        # failing assertion skipped this call entirely.
        loader.cleanup_image_dir()