Spaces:
Runtime error
Runtime error
Merge pull request #14 from soumik12345/feat/ensemble-of-image-loaders
Browse files- docs/document_loader/image_loader/base_img_loader.md +3 -0
- docs/document_loader/image_loader/fitzpil_img_loader.md +22 -0
- docs/document_loader/image_loader/marker_img_loader.md +21 -0
- docs/document_loader/image_loader/pdf2image_img_loader.md +26 -0
- docs/document_loader/image_loader/pdfplumber_img_loader.md +22 -0
- docs/document_loader/image_loader/pymupdf_img_loader.md +23 -0
- docs/document_loader/load_image.md +0 -3
- medrag_multi_modal/document_loader/__init__.py +12 -4
- medrag_multi_modal/document_loader/image_loader/__init__.py +13 -0
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +113 -0
- medrag_multi_modal/document_loader/image_loader/fitzpil_img_loader.py +127 -0
- medrag_multi_modal/document_loader/image_loader/marker_img_loader.py +100 -0
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +92 -0
- medrag_multi_modal/document_loader/image_loader/pdfplumber_img_loader.py +101 -0
- medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py +124 -0
- medrag_multi_modal/document_loader/load_image.py +0 -131
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +10 -9
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +9 -8
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +9 -8
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +9 -8
- medrag_multi_modal/retrieval/multi_modal_retrieval.py +2 -1
- mkdocs.yml +7 -1
docs/document_loader/image_loader/base_img_loader.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Load images from PDF files
|
| 2 |
+
|
| 3 |
+
::: medrag_multi_modal.document_loader.image_loader.base_img_loader
|
docs/document_loader/image_loader/fitzpil_img_loader.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files (using Fitz & PIL)
|
| 2 |
+
|
| 3 |
+
??? note "Note"
|
| 4 |
+
**Underlying Library:** `fitz` & `pillow`
|
| 5 |
+
|
| 6 |
+
Extract images from PDF files using `fitz` and `pillow`.
|
| 7 |
+
|
| 8 |
+
Use it in our library with:
|
| 9 |
+
```python
|
| 10 |
+
from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
For more details, please refer to the sources below.
|
| 14 |
+
|
| 15 |
+
**Sources:**
|
| 16 |
+
|
| 17 |
+
- [Docs](https://pymupdf.readthedocs.io/en/latest/intro.html)
|
| 18 |
+
- [GitHub](https://github.com/pymupdf/PyMuPDF)
|
| 19 |
+
- [PyPI](https://pypi.org/project/PyMuPDF/)
|
| 20 |
+
- [PyPI](https://pypi.org/project/pillow/)
|
| 21 |
+
|
| 22 |
+
::: medrag_multi_modal.document_loader.image_loader.fitzpil_img_loader
|
docs/document_loader/image_loader/marker_img_loader.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files (using Marker)
|
| 2 |
+
|
| 3 |
+
??? note "Note"
|
| 4 |
+
**Underlying Library:** `marker-pdf`
|
| 5 |
+
|
| 6 |
+
Extract images from PDF files using `marker-pdf`.
|
| 7 |
+
|
| 8 |
+
Use it in our library with:
|
| 9 |
+
```python
|
| 10 |
+
from medrag_multi_modal.document_loader.image_loader import MarkerImageLoader
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
For details, please refer to the sources below.
|
| 14 |
+
|
| 15 |
+
**Sources:**
|
| 16 |
+
|
| 17 |
+
- [DataLab](https://www.datalab.to)
|
| 18 |
+
- [GitHub](https://github.com/VikParuchuri/marker)
|
| 19 |
+
- [PyPI](https://pypi.org/project/marker-pdf/)
|
| 20 |
+
|
| 21 |
+
::: medrag_multi_modal.document_loader.image_loader.marker_img_loader
|
docs/document_loader/image_loader/pdf2image_img_loader.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files (using PDF2Image)
|
| 2 |
+
|
| 3 |
+
!!! danger "Warning"
|
| 4 |
+
Unlike other image extraction methods in `document_loader.image_loader`, this loader does not extract embedded images from the PDF.
|
| 5 |
+
Instead, it creates a snapshot image version of each selected page from the PDF.
|
| 6 |
+
|
| 7 |
+
??? note "Note"
|
| 8 |
+
**Underlying Library:** `pdf2image`
|
| 9 |
+
|
| 10 |
+
Render pages of PDF files as images using `pdf2image`.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
Use it in our library with:
|
| 14 |
+
```python
|
| 15 |
+
from medrag_multi_modal.document_loader.image_loader import PDF2ImageLoader
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
For details and available `**kwargs`, please refer to the sources below.
|
| 19 |
+
|
| 20 |
+
**Sources:**
|
| 21 |
+
|
| 22 |
+
- [Docs](https://belval.github.io/pdf2image/)
|
| 23 |
+
- [GitHub](https://github.com/Belval/pdf2image)
|
| 24 |
+
- [PyPI](https://pypi.org/project/pdf2image/)
|
| 25 |
+
|
| 26 |
+
::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
|
docs/document_loader/image_loader/pdfplumber_img_loader.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files (using PDFPlumber)
|
| 2 |
+
|
| 3 |
+
??? note "Note"
|
| 4 |
+
**Underlying Library:** `pdfplumber`
|
| 5 |
+
|
| 6 |
+
Extract images from PDF files using `pdfplumber`.
|
| 7 |
+
|
| 8 |
+
You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
|
| 9 |
+
|
| 10 |
+
Use it in our library with:
|
| 11 |
+
```python
|
| 12 |
+
from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
For details, please refer to the sources below.
|
| 16 |
+
|
| 17 |
+
**Sources:**
|
| 18 |
+
|
| 19 |
+
- [GitHub](https://github.com/jsvine/pdfplumber)
|
| 20 |
+
- [PyPI](https://pypi.org/project/pdfplumber/)
|
| 21 |
+
|
| 22 |
+
::: medrag_multi_modal.document_loader.image_loader.pdfplumber_img_loader
|
docs/document_loader/image_loader/pymupdf_img_loader.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files (using PyMuPDF)
|
| 2 |
+
|
| 3 |
+
??? note "Note"
|
| 4 |
+
**Underlying Library:** `pymupdf`
|
| 5 |
+
|
| 6 |
+
PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
|
| 7 |
+
|
| 8 |
+
You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
|
| 9 |
+
|
| 10 |
+
Use it in our library with:
|
| 11 |
+
```python
|
| 12 |
+
from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
For details, please refer to the sources below.
|
| 16 |
+
|
| 17 |
+
**Sources:**
|
| 18 |
+
|
| 19 |
+
- [Docs](https://pymupdf.readthedocs.io/en/latest/)
|
| 20 |
+
- [GitHub](https://github.com/pymupdf/PyMuPDF)
|
| 21 |
+
- [PyPI](https://pypi.org/project/PyMuPDF/)
|
| 22 |
+
|
| 23 |
+
::: medrag_multi_modal.document_loader.image_loader.pymupdf_img_loader
|
docs/document_loader/load_image.md
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
# Load PDF pages as images
|
| 2 |
-
|
| 3 |
-
::: medrag_multi_modal.document_loader.load_image
|
|
|
|
|
|
|
|
|
|
|
|
medrag_multi_modal/document_loader/__init__.py
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
-
from .
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from .text_loader import (
|
| 4 |
MarkerTextLoader,
|
| 5 |
PDFPlumberTextLoader,
|
|
@@ -12,6 +17,9 @@ __all__ = [
|
|
| 12 |
"PyPDF2TextLoader",
|
| 13 |
"PDFPlumberTextLoader",
|
| 14 |
"MarkerTextLoader",
|
| 15 |
-
"
|
| 16 |
-
"
|
|
|
|
|
|
|
|
|
|
| 17 |
]
|
|
|
|
| 1 |
+
from .image_loader import (
|
| 2 |
+
FitzPILImageLoader,
|
| 3 |
+
MarkerImageLoader,
|
| 4 |
+
PDF2ImageLoader,
|
| 5 |
+
PDFPlumberImageLoader,
|
| 6 |
+
PyMuPDFImageLoader,
|
| 7 |
+
)
|
| 8 |
from .text_loader import (
|
| 9 |
MarkerTextLoader,
|
| 10 |
PDFPlumberTextLoader,
|
|
|
|
| 17 |
"PyPDF2TextLoader",
|
| 18 |
"PDFPlumberTextLoader",
|
| 19 |
"MarkerTextLoader",
|
| 20 |
+
"PDF2ImageLoader",
|
| 21 |
+
"MarkerImageLoader",
|
| 22 |
+
"PDFPlumberImageLoader",
|
| 23 |
+
"PyMuPDFImageLoader",
|
| 24 |
+
"FitzPILImageLoader",
|
| 25 |
]
|
medrag_multi_modal/document_loader/image_loader/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .fitzpil_img_loader import FitzPILImageLoader
|
| 2 |
+
from .marker_img_loader import MarkerImageLoader
|
| 3 |
+
from .pdf2image_img_loader import PDF2ImageLoader
|
| 4 |
+
from .pdfplumber_img_loader import PDFPlumberImageLoader
|
| 5 |
+
from .pymupdf_img_loader import PyMuPDFImageLoader
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
"PDF2ImageLoader",
|
| 9 |
+
"MarkerImageLoader",
|
| 10 |
+
"PDFPlumberImageLoader",
|
| 11 |
+
"PyMuPDFImageLoader",
|
| 12 |
+
"FitzPILImageLoader",
|
| 13 |
+
]
|
medrag_multi_modal/document_loader/image_loader/base_img_loader.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
from abc import abstractmethod
|
| 4 |
+
from typing import Dict, List, Optional
|
| 5 |
+
|
| 6 |
+
import rich
|
| 7 |
+
|
| 8 |
+
import wandb
|
| 9 |
+
from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
|
| 10 |
+
BaseTextLoader,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseImageLoader(BaseTextLoader):
    """
    Base class for loaders that extract images from a PDF document.

    Subclasses implement `extract_page_data` for a specific PDF library; `load_data`
    drives that method over a range of pages and optionally publishes the resulting
    image directory as a WandB artifact.

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    @abstractmethod
    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, str]:
        """
        Abstract method to process a single page of the PDF and extract the image data.

        Override this method in the subclass to provide the actual implementation and
        processing logic for each page of the PDF using various PDF processing libraries.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments that may be used by underlying libraries.

        Returns:
            Dict[str, str]: A dictionary containing the processed page data.
        """
        pass

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        wandb_artifact_name: Optional[str] = None,
        image_save_dir: str = "./images",
        cleanup: bool = True,
        **kwargs,
    ) -> List[Dict[str, str]]:
        """
        Asynchronously extracts images from a range of pages of the PDF and
        optionally publishes them to a WandB artifact.

        The page range is resolved by `self.get_page_indices` (not defined in this
        class; presumably inherited from `BaseTextLoader`). Every page index in
        `range(start_page, end_page)` of the resolved range is then handed to the
        overridden `extract_page_data` method, which provides the actual
        implementation to process the page and write its image(s) into
        `image_save_dir`.

        The per-page coroutines are run with `asyncio.gather`, so the returned list
        is always in ascending page order, independent of the order in which the
        individual pages finish.

        If a wandb_artifact_name is provided, the whole `image_save_dir` directory is
        added to a WandB artifact of type "dataset" and saved.

        Args:
            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
            wandb_artifact_name (Optional[str]): The name of the WandB artifact to publish the pages to, if provided.
            image_save_dir (str): The directory to save the extracted images. It is created if it does not exist.
            cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
            **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries, each containing the image and metadata for a processed page.
            The values are not all strings (e.g. the page index is an int).
            Each dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.

        Raises:
            ValueError: If the specified start_page or end_page is out of bounds of the document's page count
                (presumably raised by `get_page_indices`; not verifiable from this class).
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            page_data = await self.extract_page_data(page_idx, image_save_dir, **kwargs)
            rich.print(
                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
            )
            processed_pages_counter += 1
            return page_data

        # gather() returns results in the order the coroutines were passed in, so the
        # output is sorted by page index even if pages finish out of order.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if wandb_artifact_name:
            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            rich.print("Artifact saved and uploaded to wandb!")

            # Local copies are only discarded once they have been handed to WandB.
            # Only regular files directly inside `image_save_dir` are removed.
            if cleanup:
                for file in os.listdir(image_save_dir):
                    file_path = os.path.join(image_save_dir, file)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
        return pages
|
medrag_multi_modal/document_loader/image_loader/fitzpil_img_loader.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import os
|
| 3 |
+
from typing import Any, Dict
|
| 4 |
+
|
| 5 |
+
import fitz
|
| 6 |
+
from PIL import Image, ImageOps, UnidentifiedImageError
|
| 7 |
+
|
| 8 |
+
from .base_img_loader import BaseImageLoader
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class FitzPILImageLoader(BaseImageLoader):
    """
    `FitzPILImageLoader` is a class that extends the `BaseImageLoader` class to handle the
    extraction of images embedded in the pages of a PDF file using the fitz and PIL libraries.

    This class provides functionality to extract images from a PDF file using fitz and PIL libraries,
    and optionally publish these images to a WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = FitzPILImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images-fitzpil",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images embedded in a single page of the PDF using fitz and PIL libraries.

        Every image referenced by the page is read with fitz and opened with PIL.
        Images whose extension is "png", "jpg" or "jpeg" are written to disk as the
        raw extracted bytes; any other format is re-encoded and saved as PNG.
        Images that PIL cannot identify or that fail with an `OSError` are skipped
        with a printed message.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted image.
            **kwargs: Accepted for interface compatibility; not used by this implementation.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        image_file_paths = []

        pdf_document = fitz.open(self.document_file_path)
        # Close the document even if loading the page or extracting an image raises.
        try:
            page = pdf_document.load_page(page_idx)

            images = page.get_images(full=True)
            for img_idx, image in enumerate(images):
                # The first entry of each image record is passed to extract_image as xref.
                xref = image[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                try:
                    img = Image.open(io.BytesIO(image_bytes))

                    if img.mode in ["L"]:
                        # Greyscale images look inverted; this still needs testing on other PDFs.
                        img = ImageOps.invert(img)

                    if img.mode == "CMYK":
                        img = img.convert("RGB")

                    if image_ext not in ["png", "jpg", "jpeg"]:
                        # Unsupported extension: re-encode the (possibly converted) PIL image as PNG.
                        image_ext = "png"
                        image_file_name = f"page{page_idx}_fig{img_idx}.png"
                        image_file_path = os.path.join(image_save_dir, image_file_name)

                        img.save(image_file_path, format="PNG")
                    else:
                        # Supported extension: write the raw bytes untouched, so the
                        # inversion/conversion above does not affect the saved file.
                        image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                        image_file_path = os.path.join(image_save_dir, image_file_name)

                        with open(image_file_path, "wb") as image_file:
                            image_file.write(image_bytes)

                    image_file_paths.append(image_file_path)

                except (UnidentifiedImageError, OSError) as e:
                    print(
                        f"Skipping image at page {page_idx}, fig {img_idx} due to an error: {e}"
                    )
        finally:
            pdf_document.close()

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }
|
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
from marker.convert import convert_single_pdf
|
| 5 |
+
from marker.models import load_all_models
|
| 6 |
+
|
| 7 |
+
from .base_img_loader import BaseImageLoader
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MarkerImageLoader(BaseImageLoader):
    """
    `MarkerImageLoader` is a class that extends the `BaseImageLoader` class to handle the
    extraction of images from the pages of a PDF file using the marker library.

    This class provides functionality to extract images from a PDF file using marker library,
    and optionally publish these images to a WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import MarkerImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = MarkerImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                wandb_artifact_name="grays-anatomy-images-marker",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)
        # Load the marker models once per loader and reuse them for every page.
        self.model_lst = load_all_models()

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images of a single page of the PDF using marker library.

        The page is converted with `convert_single_pdf`, restricted to one page
        starting at `page_idx`. Every image it returns is saved as a PNG file in
        `image_save_dir`.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted image.
            **kwargs: Additional keyword arguments forwarded to `convert_single_pdf`.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
            - "meta": the metadata object returned by `convert_single_pdf`.
        """
        # The first element of the result is not needed here and is discarded.
        _, images, out_meta = convert_single_pdf(
            self.document_file_path,
            self.model_lst,
            max_pages=1,
            batch_multiplier=1,
            start_page=page_idx,
            ocr_all_pages=True,
            **kwargs,
        )

        image_file_paths = []
        # Only the image objects are needed; the mapping keys are ignored and the
        # files are named by their position within the page instead.
        for img_idx, image in enumerate(images.values()):
            image_file_name = f"page{page_idx}_fig{img_idx}.png"
            image_file_path = os.path.join(image_save_dir, image_file_name)
            image.save(image_file_path, "png")
            image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
            "meta": out_meta,
        }
|
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
from pdf2image.pdf2image import convert_from_path
|
| 5 |
+
|
| 6 |
+
from .base_img_loader import BaseImageLoader
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class PDF2ImageLoader(BaseImageLoader):
    """
    `PDF2ImageLoader` extends `BaseImageLoader` to turn pages of a PDF file into
    images using the pdf2image library.

    Rather than pulling out images embedded in the document, it converts each
    selected page into one image (a snapshot of the page) and can optionally
    publish these images to a WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PDF2ImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PDF2ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                wandb_artifact_name="grays-anatomy-images-pdf2image",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Converts a single page of the PDF into an image using pdf2image library.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted image.
            **kwargs: Additional keyword arguments forwarded to `convert_from_path`.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_path": (str) the local file path where the image is stored.
        """
        # `convert_from_path` is given `page_idx + 1` for both bounds, so exactly
        # one page is requested.
        page_number = page_idx + 1
        rendered_pages = convert_from_path(
            self.document_file_path,
            first_page=page_number,
            last_page=page_number,
            **kwargs,
        )
        snapshot = rendered_pages[0]

        image_file_path = os.path.join(image_save_dir, f"page{page_idx}.png")
        snapshot.save(image_file_path)

        page_record = {}
        page_record["page_idx"] = page_idx
        page_record["document_name"] = self.document_name
        page_record["file_path"] = self.document_file_path
        page_record["file_url"] = self.url
        page_record["image_file_path"] = image_file_path
        return page_record
|
medrag_multi_modal/document_loader/image_loader/pdfplumber_img_loader.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
import pdfplumber
|
| 5 |
+
|
| 6 |
+
from .base_img_loader import BaseImageLoader
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class PDFPlumberImageLoader(BaseImageLoader):
    """
    `PDFPlumberImageLoader` is a class that extends the `BaseImageLoader` class to handle the
    extraction of the images found on the pages of a PDF file using the pdfplumber library.

    For every image object that pdfplumber reports on a page, the region of the page covered
    by that image is rendered to a PNG file. The images can optionally be published to a
    WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PDFPlumberImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images-pdfplumber",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images found on a single page of the PDF using pdfplumber library.

        Each image region is cropped out of the page, rendered at 300 DPI and saved as
        `page{page_idx}_fig{img_idx}.png` inside `image_save_dir`.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Accepted for interface compatibility; not used by this loader.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        image_file_paths = []

        with pdfplumber.open(self.document_file_path) as pdf:
            page = pdf.pages[page_idx]
            page_x0, page_top, page_x1, page_bottom = page.bbox

            for img_idx, image in enumerate(page.images):
                # pdfplumber's strict crop raises ValueError for a box that is not fully
                # inside the page, and images may overhang the page edge. Clamp the box
                # to the page so one such image does not abort the whole page.
                crop_box = (
                    max(image["x0"], page_x0),
                    max(image["top"], page_top),
                    min(image["x1"], page_x1),
                    min(image["bottom"], page_bottom),
                )
                if crop_box[0] >= crop_box[2] or crop_box[1] >= crop_box[3]:
                    # Nothing of this image is visible on the page.
                    continue

                extracted_image = page.crop(crop_box).to_image(resolution=300)

                image_file_name = f"page{page_idx}_fig{img_idx}.png"
                image_file_path = os.path.join(image_save_dir, image_file_name)

                extracted_image.save(image_file_path, "png")
                image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }
|
medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import os
|
| 3 |
+
from typing import Any, Dict
|
| 4 |
+
|
| 5 |
+
import fitz
|
| 6 |
+
from PIL import Image
|
| 7 |
+
|
| 8 |
+
from .base_img_loader import BaseImageLoader
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class PyMuPDFImageLoader(BaseImageLoader):
    """
    `PyMuPDFImageLoader` is a class that extends the `BaseImageLoader` class to handle the
    extraction of the images embedded in the pages of a PDF file using the pymupdf library.

    This class provides functionality to extract images from a PDF file using pymupdf library,
    and optionally publish these images to a WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PyMuPDFImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images-pymupdf",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images embedded in a single page of the PDF using pymupdf library.

        Each image is written to `image_save_dir` as `page{page_idx}_fig{img_idx}.{ext}`.
        JBIG2 ("jb2") and JPEG2000 ("jpx") images are re-encoded (as "png" and "jpg"
        respectively), first with fitz and then with PIL as a fallback; an image that
        neither can convert is skipped. All other images are written out unchanged.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Accepted for interface compatibility; not used by this loader.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        image_file_paths = []

        pdf_document = fitz.open(self.document_file_path)
        try:
            page = pdf_document[page_idx]

            for img_idx, image in enumerate(page.get_images(full=True)):
                xref = image[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # For JBIG2 and JPEG2000, we need to convert the image
                needs_conversion = image_ext in ("jb2", "jpx")
                if image_ext == "jb2":
                    image_ext = "png"
                elif image_ext == "jpx":
                    image_ext = "jpg"

                image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                image_file_path = os.path.join(image_save_dir, image_file_name)

                if needs_conversion:
                    try:
                        pix = fitz.Pixmap(image_bytes)
                        pix.save(image_file_path)
                    except Exception as err_fitz:
                        print(f"Error processing image with fitz: {err_fitz}")
                        # Fallback to using PIL for image conversion
                        try:
                            img = Image.open(io.BytesIO(image_bytes))
                            img.save(image_file_path)
                        except Exception as err_pil:
                            print(f"Failed to process image with PIL: {err_pil}")
                            continue  # Skip this image if both methods fail
                else:
                    with open(image_file_path, "wb") as image_file:
                        image_file.write(image_bytes)

                image_file_paths.append(image_file_path)
        finally:
            # Release the document handle even if extraction or writing raised.
            pdf_document.close()

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }
|
medrag_multi_modal/document_loader/load_image.py
DELETED
|
@@ -1,131 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
import os
|
| 3 |
-
from typing import Optional
|
| 4 |
-
|
| 5 |
-
import rich
|
| 6 |
-
import wandb
|
| 7 |
-
import weave
|
| 8 |
-
from pdf2image.pdf2image import convert_from_path
|
| 9 |
-
from PIL import Image
|
| 10 |
-
|
| 11 |
-
from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class ImageLoader(PyMuPDF4LLMTextLoader):
|
| 15 |
-
"""
|
| 16 |
-
`ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
|
| 17 |
-
loading of pages from a PDF file as images.
|
| 18 |
-
|
| 19 |
-
This class provides functionality to convert specific pages of a PDF document into images
|
| 20 |
-
and optionally publish these images to a Weave dataset.
|
| 21 |
-
|
| 22 |
-
!!! example "Example Usage"
|
| 23 |
-
```python
|
| 24 |
-
import asyncio
|
| 25 |
-
|
| 26 |
-
import wandb
|
| 27 |
-
from dotenv import load_dotenv
|
| 28 |
-
|
| 29 |
-
from medrag_multi_modal.document_loader import ImageLoader
|
| 30 |
-
|
| 31 |
-
load_dotenv()
|
| 32 |
-
wandb.init(project="medrag-multi-modal", entity="ml-colabs")
|
| 33 |
-
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
| 34 |
-
loader = ImageLoader(
|
| 35 |
-
url=url,
|
| 36 |
-
document_name="Gray's Anatomy",
|
| 37 |
-
document_file_path="grays_anatomy.pdf",
|
| 38 |
-
)
|
| 39 |
-
asyncio.run(
|
| 40 |
-
loader.load_data(
|
| 41 |
-
start_page=31,
|
| 42 |
-
end_page=33,
|
| 43 |
-
dataset_name="grays-anatomy-images",
|
| 44 |
-
)
|
| 45 |
-
)
|
| 46 |
-
```
|
| 47 |
-
|
| 48 |
-
Args:
|
| 49 |
-
url (str): The URL of the PDF document.
|
| 50 |
-
document_name (str): The name of the document.
|
| 51 |
-
document_file_path (str): The path to the PDF file.
|
| 52 |
-
"""
|
| 53 |
-
|
| 54 |
-
def __init__(self, url: str, document_name: str, document_file_path: str):
|
| 55 |
-
super().__init__(url, document_name, document_file_path)
|
| 56 |
-
|
| 57 |
-
def extract_data_from_pdf_file(
|
| 58 |
-
self, pdf_file: str, page_number: int
|
| 59 |
-
) -> Image.Image:
|
| 60 |
-
image = convert_from_path(
|
| 61 |
-
pdf_file, first_page=page_number + 1, last_page=page_number + 1
|
| 62 |
-
)[0]
|
| 63 |
-
return image
|
| 64 |
-
|
| 65 |
-
async def load_data(
|
| 66 |
-
self,
|
| 67 |
-
start_page: Optional[int] = None,
|
| 68 |
-
end_page: Optional[int] = None,
|
| 69 |
-
image_save_dir: str = "./images",
|
| 70 |
-
dataset_name: Optional[str] = None,
|
| 71 |
-
):
|
| 72 |
-
"""
|
| 73 |
-
Asynchronously loads images from a PDF file specified by a URL or local file path,
|
| 74 |
-
processes the images for the specified range of pages, and optionally publishes them
|
| 75 |
-
to a Weave dataset.
|
| 76 |
-
|
| 77 |
-
This function reads the specified range of pages from a PDF document, converts each page
|
| 78 |
-
to an image using the `pdf2image` library, and returns a list of dictionaries containing
|
| 79 |
-
the image and metadata for each processed page. It processes pages concurrently using
|
| 80 |
-
`asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
|
| 81 |
-
published to Weights & Biases artifact and the corresponding metadata to a Weave dataset
|
| 82 |
-
with the specified name.
|
| 83 |
-
|
| 84 |
-
Args:
|
| 85 |
-
start_page (Optional[int]): The starting page index (0-based) to process.
|
| 86 |
-
end_page (Optional[int]): The ending page index (0-based) to process.
|
| 87 |
-
dataset_name (Optional[str]): The name of the Weave dataset to publish the
|
| 88 |
-
processed images to. Defaults to None.
|
| 89 |
-
|
| 90 |
-
Returns:
|
| 91 |
-
list[dict]: A list of dictionaries, each containing the image and metadata for a
|
| 92 |
-
processed page.
|
| 93 |
-
|
| 94 |
-
Raises:
|
| 95 |
-
ValueError: If the specified start_page or end_page is out of bounds of the document's
|
| 96 |
-
page count.
|
| 97 |
-
"""
|
| 98 |
-
os.makedirs(image_save_dir, exist_ok=True)
|
| 99 |
-
start_page, end_page = self.get_page_indices(start_page, end_page)
|
| 100 |
-
pages = []
|
| 101 |
-
processed_pages_counter: int = 1
|
| 102 |
-
total_pages = end_page - start_page
|
| 103 |
-
|
| 104 |
-
async def process_page(page_idx):
|
| 105 |
-
nonlocal processed_pages_counter
|
| 106 |
-
image = convert_from_path(
|
| 107 |
-
self.document_file_path,
|
| 108 |
-
first_page=page_idx + 1,
|
| 109 |
-
last_page=page_idx + 1,
|
| 110 |
-
)[0]
|
| 111 |
-
pages.append(
|
| 112 |
-
{
|
| 113 |
-
"page_idx": page_idx,
|
| 114 |
-
"document_name": self.document_name,
|
| 115 |
-
"file_path": self.document_file_path,
|
| 116 |
-
"file_url": self.url,
|
| 117 |
-
}
|
| 118 |
-
)
|
| 119 |
-
image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
|
| 120 |
-
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
| 121 |
-
processed_pages_counter += 1
|
| 122 |
-
|
| 123 |
-
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
| 124 |
-
for task in asyncio.as_completed(tasks):
|
| 125 |
-
await task
|
| 126 |
-
if dataset_name:
|
| 127 |
-
artifact = wandb.Artifact(name=dataset_name, type="dataset")
|
| 128 |
-
artifact.add_dir(local_path=image_save_dir)
|
| 129 |
-
artifact.save()
|
| 130 |
-
weave.publish(weave.Dataset(name=dataset_name, rows=pages))
|
| 131 |
-
return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
CHANGED
|
@@ -53,15 +53,16 @@ class MarkerTextLoader(BaseTextLoader):
|
|
| 53 |
"""
|
| 54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
| 55 |
|
| 56 |
-
Returns
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
| 65 |
|
| 66 |
Args:
|
| 67 |
page_idx (int): The index of the page to process.
|
|
|
|
| 53 |
"""
|
| 54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
| 55 |
|
| 56 |
+
Returns:
|
| 57 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
| 58 |
+
The dictionary will have the following keys and values:
|
| 59 |
+
|
| 60 |
+
- "text": (str) the extracted structured text from the page.
|
| 61 |
+
- "page_idx": (int) the index of the page.
|
| 62 |
+
- "document_name": (str) the name of the document.
|
| 63 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 64 |
+
- "file_url": (str) the URL of the PDF file.
|
| 65 |
+
- "meta": (dict) the metadata extracted from the page by marker-pdf.
|
| 66 |
|
| 67 |
Args:
|
| 68 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
CHANGED
|
@@ -52,14 +52,15 @@ class PDFPlumberTextLoader(BaseTextLoader):
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
| 54 |
|
| 55 |
-
Returns
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
Args:
|
| 65 |
page_idx (int): The index of the page to process.
|
|
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
| 54 |
|
| 55 |
+
Returns:
|
| 56 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
| 57 |
+
The dictionary will have the following keys and values:
|
| 58 |
+
|
| 59 |
+
- "text": (str) the extracted text from the page.
|
| 60 |
+
- "page_idx": (int) the index of the page.
|
| 61 |
+
- "document_name": (str) the name of the document.
|
| 62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 63 |
+
- "file_url": (str) the URL of the PDF file.
|
| 64 |
|
| 65 |
Args:
|
| 66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
CHANGED
|
@@ -52,14 +52,15 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
| 54 |
|
| 55 |
-
Returns
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
Args:
|
| 65 |
page_idx (int): The index of the page to process.
|
|
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
| 54 |
|
| 55 |
+
Returns:
|
| 56 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
| 57 |
+
The dictionary will have the following keys and values:
|
| 58 |
+
|
| 59 |
+
- "text": (str) the processed page data in markdown format.
|
| 60 |
+
- "page_idx": (int) the index of the page.
|
| 61 |
+
- "document_name": (str) the name of the document.
|
| 62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 63 |
+
- "file_url": (str) the URL of the PDF file.
|
| 64 |
|
| 65 |
Args:
|
| 66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
CHANGED
|
@@ -52,14 +52,15 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
| 54 |
|
| 55 |
-
Returns
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
Args:
|
| 65 |
page_idx (int): The index of the page to process.
|
|
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
| 54 |
|
| 55 |
+
Returns:
|
| 56 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
| 57 |
+
The dictionary will have the following keys and values:
|
| 58 |
+
|
| 59 |
+
- "text": (str) the extracted text from the page.
|
| 60 |
+
- "page_idx": (int) the index of the page.
|
| 61 |
+
- "document_name": (str) the name of the document.
|
| 62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 63 |
+
- "file_url": (str) the URL of the PDF file.
|
| 64 |
|
| 65 |
Args:
|
| 66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/retrieval/multi_modal_retrieval.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
import os
|
| 2 |
from typing import Any, Optional
|
| 3 |
|
| 4 |
-
import wandb
|
| 5 |
import weave
|
| 6 |
from byaldi import RAGMultiModalModel
|
| 7 |
from PIL import Image
|
| 8 |
|
|
|
|
|
|
|
| 9 |
from ..utils import get_wandb_artifact
|
| 10 |
|
| 11 |
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import Any, Optional
|
| 3 |
|
|
|
|
| 4 |
import weave
|
| 5 |
from byaldi import RAGMultiModalModel
|
| 6 |
from PIL import Image
|
| 7 |
|
| 8 |
+
import wandb
|
| 9 |
+
|
| 10 |
from ..utils import get_wandb_artifact
|
| 11 |
|
| 12 |
|
mkdocs.yml
CHANGED
|
@@ -69,7 +69,13 @@ nav:
|
|
| 69 |
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
| 70 |
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
| 71 |
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
| 72 |
-
- Image Loader:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
- Chunking: 'chunking.md'
|
| 74 |
- Retrieval:
|
| 75 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|
|
|
|
| 69 |
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
| 70 |
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
| 71 |
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
| 72 |
+
- Image Loader:
|
| 73 |
+
- Base: 'document_loader/image_loader/base_img_loader.md'
|
| 74 |
+
- PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
|
| 75 |
+
- Marker: 'document_loader/image_loader/marker_img_loader.md'
|
| 76 |
+
- PDFPlumber: 'document_loader/image_loader/pdfplumber_img_loader.md'
|
| 77 |
+
- PyMuPDF: 'document_loader/image_loader/pymupdf_img_loader.md'
|
| 78 |
+
- FitzPIL: 'document_loader/image_loader/fitzpil_img_loader.md'
|
| 79 |
- Chunking: 'chunking.md'
|
| 80 |
- Retrieval:
|
| 81 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|