Spaces:
Runtime error
Runtime error
Commit
·
cc5cebc
1
Parent(s):
5c74069
add: docs for base img loader + pdf2image
Browse files- docs/document_loader/image_loader/base_img_loader.md +3 -0
- docs/document_loader/image_loader/pdf2image_img_loader.md +3 -0
- docs/document_loader/load_image.md +0 -3
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +50 -0
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +24 -0
- mkdocs.yml +3 -1
docs/document_loader/image_loader/base_img_loader.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files
|
| 2 |
+
|
| 3 |
+
::: medrag_multi_modal.document_loader.image_loader.base_img_loader
|
docs/document_loader/image_loader/pdf2image_img_loader.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load images from PDF files (using pdf2image)
|
| 2 |
+
|
| 3 |
+
::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
|
docs/document_loader/load_image.md
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
# Load PDF pages as images
|
| 2 |
-
|
| 3 |
-
::: medrag_multi_modal.document_loader.load_image
|
|
|
|
|
|
|
|
|
|
|
|
medrag_multi_modal/document_loader/image_loader/base_img_loader.py
CHANGED
|
@@ -19,6 +19,20 @@ class BaseImageLoader(BaseTextLoader):
|
|
| 19 |
async def extract_page_data(
|
| 20 |
self, page_idx: int, image_save_dir: str, **kwargs
|
| 21 |
) -> Dict[str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
pass
|
| 23 |
|
| 24 |
async def load_data(
|
|
@@ -30,6 +44,42 @@ class BaseImageLoader(BaseTextLoader):
|
|
| 30 |
cleanup: bool = True,
|
| 31 |
**kwargs,
|
| 32 |
) -> List[Dict[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
os.makedirs(image_save_dir, exist_ok=True)
|
| 34 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
| 35 |
pages = []
|
|
|
|
| 19 |
async def extract_page_data(
|
| 20 |
self, page_idx: int, image_save_dir: str, **kwargs
|
| 21 |
) -> Dict[str, str]:
|
| 22 |
+
"""
|
| 23 |
+
Abstract method to process a single page of the PDF and extract the image data.
|
| 24 |
+
|
| 25 |
+
Override this method in the subclass to provide the actual implementation and
|
| 26 |
+
processing logic for each page of the PDF using various PDF processing libraries.
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
page_idx (int): The index of the page to process.
|
| 30 |
+
image_save_dir (str): The directory to save the extracted images.
|
| 31 |
+
**kwargs: Additional keyword arguments that may be used by underlying libraries.
|
| 32 |
+
|
| 33 |
+
Returns:
|
| 34 |
+
Dict[str, str]: A dictionary containing the processed page data.
|
| 35 |
+
"""
|
| 36 |
pass
|
| 37 |
|
| 38 |
async def load_data(
|
|
|
|
| 44 |
cleanup: bool = True,
|
| 45 |
**kwargs,
|
| 46 |
) -> List[Dict[str, str]]:
|
| 47 |
+
"""
|
| 48 |
+
Asynchronously loads images from a PDF file specified by a URL or local file path.
|
| 49 |
+
The overridden processing abstract method then processes the images,
|
| 50 |
+
and optionally publishes them to a Weave artifact.
|
| 51 |
+
|
| 52 |
+
This function downloads a PDF from a given URL if it does not already exist locally,
|
| 53 |
+
reads the specified range of pages, scans each page's content to extract images, and
|
| 54 |
+
returns a list of page dictionaries containing the image file paths and metadata.
|
| 55 |
+
|
| 56 |
+
It uses `PyPDF2` to calculate the number of pages in the PDF and the
|
| 57 |
+
overridden `extract_page_data` method provides the actual implementation to process
|
| 58 |
+
each page, extract the image content from the PDF, and convert it to png format.
|
| 59 |
+
It processes pages concurrently using `asyncio` for efficiency.
|
| 60 |
+
|
| 61 |
+
If a wandb_artifact_name is provided, the processed pages are published to a Weave artifact.
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
| 65 |
+
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
| 66 |
+
wandb_artifact_name (Optional[str]): The name of the Weave artifact to publish the pages to, if provided.
|
| 67 |
+
image_save_dir (str): The directory to save the extracted images.
|
| 68 |
+
cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
|
| 69 |
+
**kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
List[Dict[str, Any]]: A list of dictionaries, each containing the image and metadata for a processed page.
|
| 73 |
+
Each dictionary will have the following keys and values:
|
| 74 |
+
|
| 75 |
+
- "page_idx": (int) the index of the page.
|
| 76 |
+
- "document_name": (str) the name of the document.
|
| 77 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 78 |
+
- "file_url": (str) the URL of the PDF file.
|
| 79 |
+
- "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.
|
| 80 |
+
Raises:
|
| 81 |
+
ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
|
| 82 |
+
"""
|
| 83 |
os.makedirs(image_save_dir, exist_ok=True)
|
| 84 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
| 85 |
pages = []
|
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
CHANGED
|
@@ -7,6 +7,19 @@ from .base_img_loader import BaseImageLoader
|
|
| 7 |
|
| 8 |
|
| 9 |
class PDF2ImageLoader(BaseImageLoader):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def __init__(self, url: str, document_name: str, document_file_path: str):
|
| 12 |
super().__init__(url, document_name, document_file_path)
|
|
@@ -14,6 +27,17 @@ class PDF2ImageLoader(BaseImageLoader):
|
|
| 14 |
async def extract_page_data(
|
| 15 |
self, page_idx: int, image_save_dir: str, **kwargs
|
| 16 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
image = convert_from_path(
|
| 18 |
self.document_file_path,
|
| 19 |
first_page=page_idx + 1,
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class PDF2ImageLoader(BaseImageLoader):
|
| 10 |
+
"""
|
| 11 |
+
`PDF2ImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
|
| 12 |
+
loading of pages from a PDF file as images using the pdf2image library.
|
| 13 |
+
|
| 14 |
+
This class provides functionality to convert specific pages of a PDF document into images
|
| 15 |
+
and optionally publish these images to a Weave artifact.
|
| 16 |
+
It is like a snapshot image version of each of the pages from the PDF.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
url (str): The URL of the PDF document.
|
| 20 |
+
document_name (str): The name of the document.
|
| 21 |
+
document_file_path (str): The path to the PDF file.
|
| 22 |
+
"""
|
| 23 |
|
| 24 |
def __init__(self, url: str, document_name: str, document_file_path: str):
|
| 25 |
super().__init__(url, document_name, document_file_path)
|
|
|
|
| 27 |
async def extract_page_data(
|
| 28 |
self, page_idx: int, image_save_dir: str, **kwargs
|
| 29 |
) -> Dict[str, Any]:
|
| 30 |
+
"""
|
| 31 |
+
Extracts a single page from the PDF as an image using pdf2image library.
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
page_idx (int): The index of the page to process.
|
| 35 |
+
image_save_dir (str): The directory to save the extracted image.
|
| 36 |
+
**kwargs: Additional keyword arguments that may be used by pdf2image.
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
Dict[str, Any]: A dictionary containing the processed page data.
|
| 40 |
+
"""
|
| 41 |
image = convert_from_path(
|
| 42 |
self.document_file_path,
|
| 43 |
first_page=page_idx + 1,
|
mkdocs.yml
CHANGED
|
@@ -69,7 +69,9 @@ nav:
|
|
| 69 |
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
| 70 |
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
| 71 |
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
| 72 |
-
- Image Loader:
|
|
|
|
|
|
|
| 73 |
- Chunking: 'chunking.md'
|
| 74 |
- Retrieval:
|
| 75 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|
|
|
|
| 69 |
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
| 70 |
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
| 71 |
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
| 72 |
+
- Image Loader:
|
| 73 |
+
- Base: 'document_loader/image_loader/base_img_loader.md'
|
| 74 |
+
- PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
|
| 75 |
- Chunking: 'chunking.md'
|
| 76 |
- Retrieval:
|
| 77 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|