Spaces:
Runtime error
Runtime error
| import asyncio | |
| from medrag_multi_modal.document_loader import ( | |
| PDFPlumberTextLoader, | |
| PyMuPDF4LLMTextLoader, | |
| PyPDF2TextLoader, | |
| ) | |
# Source PDF handed to every loader under test (Gray's Anatomy, 41st ed.).
# Split across two literals purely for line length; the value is unchanged.
URL = (
    "https://archive.org/download/GraysAnatomy41E2015PDF/"
    "Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
)

# Column names, in order, that each test expects on the loaded dataset.
COLUMN_NAMES = [
    "text", "page_idx",
    "document_name", "file_path", "file_url",
    "loader_name",
]
def test_pdfplumber_text_loader():
    """Check PDFPlumberTextLoader output for pages 31-36 of the test PDF.

    The loader is built from the module-level ``URL`` and asked for the
    page range 31..36; the resulting dataset must have 6 rows and exactly
    the columns listed in ``COLUMN_NAMES``.
    """
    loader_kwargs = {
        "url": URL,
        "document_name": "Gray's Anatomy",
        "document_file_path": "grays_anatomy.pdf",
    }
    text_loader = PDFPlumberTextLoader(**loader_kwargs)

    # load_data is awaited via asyncio.run, so the test itself stays synchronous.
    load_coro = text_loader.load_data(start_page=31, end_page=36)
    pages = asyncio.run(load_coro)

    assert pages.num_rows == 6
    assert pages.column_names == COLUMN_NAMES
def test_pymupdf_text_loader():
    """Check PyMuPDF4LLMTextLoader output for pages 31-36 of the test PDF.

    The loader is built from the module-level ``URL`` and asked for the
    page range 31..36; the resulting dataset must have 6 rows and exactly
    the columns listed in ``COLUMN_NAMES``.
    """
    loader_kwargs = {
        "url": URL,
        "document_name": "Gray's Anatomy",
        "document_file_path": "grays_anatomy.pdf",
    }
    text_loader = PyMuPDF4LLMTextLoader(**loader_kwargs)

    # load_data is awaited via asyncio.run, so the test itself stays synchronous.
    load_coro = text_loader.load_data(start_page=31, end_page=36)
    pages = asyncio.run(load_coro)

    assert pages.num_rows == 6
    assert pages.column_names == COLUMN_NAMES
def test_pypdf2_text_loader():
    """Check PyPDF2TextLoader output for pages 31-36 of the test PDF.

    The loader is built from the module-level ``URL`` and asked for the
    page range 31..36; the resulting dataset must have 6 rows and exactly
    the columns listed in ``COLUMN_NAMES``.
    """
    loader_kwargs = {
        "url": URL,
        "document_name": "Gray's Anatomy",
        "document_file_path": "grays_anatomy.pdf",
    }
    text_loader = PyPDF2TextLoader(**loader_kwargs)

    # load_data is awaited via asyncio.run, so the test itself stays synchronous.
    load_coro = text_loader.load_data(start_page=31, end_page=36)
    pages = asyncio.run(load_coro)

    assert pages.num_rows == 6
    assert pages.column_names == COLUMN_NAMES