Spaces:
Runtime error
Runtime error
Commit
·
5c74069
1
Parent(s):
694a076
add: base image loader + pdf2img from load_image
Browse files- medrag_multi_modal/document_loader/__init__.py +2 -4
- medrag_multi_modal/document_loader/image_loader/__init__.py +4 -0
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +63 -0
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +34 -0
- medrag_multi_modal/document_loader/load_image.py +0 -131
medrag_multi_modal/document_loader/__init__.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
from .load_image import ImageLoader
|
| 2 |
-
from .load_text_image import TextImageLoader
|
| 3 |
from .text_loader import (
|
| 4 |
MarkerTextLoader,
|
| 5 |
PDFPlumberTextLoader,
|
|
@@ -12,6 +11,5 @@ __all__ = [
|
|
| 12 |
"PyPDF2TextLoader",
|
| 13 |
"PDFPlumberTextLoader",
|
| 14 |
"MarkerTextLoader",
|
| 15 |
-
"ImageLoader",
|
| 16 |
-
"TextImageLoader",
|
| 17 |
]
|
|
|
|
| 1 |
+
from .image_loader import PDF2ImageLoader
|
|
|
|
| 2 |
from .text_loader import (
|
| 3 |
MarkerTextLoader,
|
| 4 |
PDFPlumberTextLoader,
|
|
|
|
| 11 |
"PyPDF2TextLoader",
|
| 12 |
"PDFPlumberTextLoader",
|
| 13 |
"MarkerTextLoader",
|
| 14 |
+
"PDF2ImageLoader",
|
|
|
|
| 15 |
]
|
medrag_multi_modal/document_loader/image_loader/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base_img_loader import BaseImageLoader
|
| 2 |
+
from .pdf2image_img_loader import PDF2ImageLoader
|
| 3 |
+
|
| 4 |
+
__all__ = ["PDF2ImageLoader", "BaseImageLoader"]
|
medrag_multi_modal/document_loader/image_loader/base_img_loader.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import os
from abc import abstractmethod
from typing import Any, Dict, List, Optional

import rich
import wandb

from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
    BaseTextLoader,
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseImageLoader(BaseTextLoader):
    """Base class for loaders that turn document pages into image files.

    Subclasses implement `extract_page_data` to handle a single page;
    `load_data` drives that hook over a page range, optionally stores the
    image directory as a Weights & Biases artifact, and optionally removes
    the generated files afterwards.

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    @abstractmethod
    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """Process one page and return its metadata.

        Args:
            page_idx (int): Index of the page to process, taken from
                `range(start_page, end_page)` in `load_data`.
            image_save_dir (str): Directory in which the page image should
                be written.
            **kwargs: Extra keyword arguments forwarded unchanged from
                `load_data`.

        Returns:
            Dict[str, Any]: Metadata describing the processed page. Values
                are not restricted to strings (a page index is an int).
        """
        pass

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        wandb_artifact_name: Optional[str] = None,
        image_save_dir: str = "./images",
        cleanup: bool = True,
        **kwargs,
    ) -> List[Dict[str, Any]]:
        """Process a range of pages and collect their metadata.

        Args:
            start_page (Optional[int]): First page index to process. `None`
                is resolved by `get_page_indices` (defined on the parent
                class; presumably the first page -- confirm there).
            end_page (Optional[int]): Page index at which processing stops;
                the page at this index itself is not processed. `None` is
                resolved by `get_page_indices`.
            wandb_artifact_name (Optional[str]): When truthy, the contents of
                `image_save_dir` are added to a `wandb.Artifact` of type
                `"dataset"` with this name and the artifact is saved.
            image_save_dir (str): Directory for the page images; created if
                it does not exist.
            cleanup (bool): When true, every regular file directly inside
                `image_save_dir` is deleted before returning. This includes
                files that were already in the directory, and it means any
                image paths stored in the returned dicts may no longer exist.
            **kwargs: Forwarded unchanged to `extract_page_data`.

        Returns:
            List[Dict[str, Any]]: One dict per processed page, in the order
                in which the pages finished processing.
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        pages = []
        total_pages = end_page - start_page

        async def process_page(page_idx):
            page_data = await self.extract_page_data(page_idx, image_save_dir, **kwargs)
            pages.append(page_data)
            # `pages` grows by exactly one entry per finished page, so its
            # length doubles as the progress counter.
            rich.print(
                f"Processed page idx: {page_idx}, progress: {len(pages)}/{total_pages}"
            )

        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
        for task in asyncio.as_completed(tasks):
            await task

        if wandb_artifact_name:
            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            rich.print("Artifact saved and uploaded to wandb!")

        if cleanup:
            # Only regular files at the top level are removed; sub-directories
            # and the directory itself are left in place.
            for file in os.listdir(image_save_dir):
                file_path = os.path.join(image_save_dir, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        return pages
|
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
from pdf2image.pdf2image import convert_from_path
|
| 5 |
+
|
| 6 |
+
from .base_img_loader import BaseImageLoader
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class PDF2ImageLoader(BaseImageLoader):
    """Image loader that renders PDF pages to image files with `pdf2image`.

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """Render one page to `page{page_idx}.png` and return its metadata.

        Args:
            page_idx (int): Index of the page to render.
            image_save_dir (str): Directory in which the image is written.
            **kwargs: Extra keyword arguments passed straight through to
                `pdf2image`'s `convert_from_path`.

        Returns:
            Dict[str, Any]: The page index, document name, PDF path, source
                URL, and the path of the image file that was written.
        """
        # `convert_from_path` is given page_idx + 1 as both bounds, so exactly
        # one page is requested. The call is made without `await`, so this
        # coroutine does not yield while the page is being rendered.
        page_number = page_idx + 1
        rendered_pages = convert_from_path(
            self.document_file_path,
            first_page=page_number,
            last_page=page_number,
            **kwargs,
        )
        page_image = rendered_pages[0]

        image_file_path = os.path.join(image_save_dir, f"page{page_idx}.png")
        page_image.save(image_file_path)

        page_record = {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_path": image_file_path,
        }
        return page_record
|
medrag_multi_modal/document_loader/load_image.py
DELETED
|
@@ -1,131 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
import os
|
| 3 |
-
from typing import Optional
|
| 4 |
-
|
| 5 |
-
import rich
|
| 6 |
-
import wandb
|
| 7 |
-
import weave
|
| 8 |
-
from pdf2image.pdf2image import convert_from_path
|
| 9 |
-
from PIL import Image
|
| 10 |
-
|
| 11 |
-
from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class ImageLoader(PyMuPDF4LLMTextLoader):
    """
    `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the
    extraction and loading of pages from a PDF file as images.

    This class provides functionality to convert specific pages of a PDF document into images
    and optionally publish these images to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import wandb
        from dotenv import load_dotenv

        from medrag_multi_modal.document_loader import ImageLoader

        load_dotenv()
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=33,
                dataset_name="grays-anatomy-images",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    def extract_data_from_pdf_file(
        self, pdf_file: str, page_number: int
    ) -> Image.Image:
        """
        Renders a single page of a PDF file as an image.

        Args:
            pdf_file (str): The path to the PDF file.
            page_number (int): The page index to render; it is passed to
                `convert_from_path` as `page_number + 1`.

        Returns:
            Image.Image: The rendered page.
        """
        image = convert_from_path(
            pdf_file, first_page=page_number + 1, last_page=page_number + 1
        )[0]
        return image

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        image_save_dir: str = "./images",
        dataset_name: Optional[str] = None,
    ):
        """
        Asynchronously converts the specified range of pages of the PDF document to images,
        saves them to a directory, and optionally publishes them.

        Each page is rendered with the `pdf2image` library and written to
        `image_save_dir` as `{page_idx}.png`. The returned dictionaries hold page metadata
        only; the images themselves are stored on disk. If a `dataset_name` is provided, the
        image directory is saved as a Weights & Biases artifact and the metadata is published
        to a Weave dataset with the specified name.

        Args:
            start_page (Optional[int]): The starting page index (0-based) to process.
            end_page (Optional[int]): The page index (0-based) at which processing stops;
                this page itself is not processed.
            image_save_dir (str): The directory in which the page images are saved. It is
                created if it does not exist. Defaults to "./images".
            dataset_name (Optional[str]): The name of the Weights & Biases artifact and
                Weave dataset to publish to. Defaults to None.

        Returns:
            list[dict]: A list of dictionaries, each containing the metadata (`page_idx`,
                `document_name`, `file_path`, `file_url`) for a processed page.

        Raises:
            ValueError: If the specified start_page or end_page is out of bounds of the document's
                page count (presumably raised by `get_page_indices`; confirm on the parent class).
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        pages = []
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            # Reuse the single-page helper instead of repeating the
            # convert_from_path call inline.
            image = self.extract_data_from_pdf_file(self.document_file_path, page_idx)
            pages.append(
                {
                    "page_idx": page_idx,
                    "document_name": self.document_name,
                    "file_path": self.document_file_path,
                    "file_url": self.url,
                }
            )
            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1

        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
        for task in asyncio.as_completed(tasks):
            await task
        if dataset_name:
            artifact = wandb.Artifact(name=dataset_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
        return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|