Spaces:
Runtime error
Runtime error
Commit
·
f37090a
1
Parent(s):
cc5cebc
chore: improve doc + code formatting
Browse files
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +8 -1
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +10 -9
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +9 -8
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +9 -8
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +9 -8
- medrag_multi_modal/retrieval/multi_modal_retrieval.py +2 -1
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
CHANGED
|
@@ -36,7 +36,14 @@ class PDF2ImageLoader(BaseImageLoader):
|
|
| 36 |
**kwargs: Additional keyword arguments that may be used by pdf2image.
|
| 37 |
|
| 38 |
Returns:
|
| 39 |
-
Dict[str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"""
|
| 41 |
image = convert_from_path(
|
| 42 |
self.document_file_path,
|
|
|
|
| 36 |
**kwargs: Additional keyword arguments that may be used by pdf2image.
|
| 37 |
|
| 38 |
Returns:
|
| 39 |
+
Dict[str, Any]: A dictionary containing the processed page data.
|
| 40 |
+
The dictionary will have the following keys and values:
|
| 41 |
+
|
| 42 |
+
- "page_idx": (int) the index of the page.
|
| 43 |
+
- "document_name": (str) the name of the document.
|
| 44 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 45 |
+
- "file_url": (str) the URL of the PDF file.
|
| 46 |
+
- "image_file_path": (str) the local file path where the image is stored.
|
| 47 |
"""
|
| 48 |
image = convert_from_path(
|
| 49 |
self.document_file_path,
|
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
CHANGED
|
@@ -53,15 +53,16 @@ class MarkerTextLoader(BaseTextLoader):
|
|
| 53 |
"""
|
| 54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
| 55 |
|
| 56 |
-
Returns
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
| 65 |
|
| 66 |
Args:
|
| 67 |
page_idx (int): The index of the page to process.
|
|
|
|
| 53 |
"""
|
| 54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
| 55 |
|
| 56 |
+
Returns:
|
| 57 |
+
Dict[str, str]: A dictionary with the processed page data.
|
| 58 |
+
The dictionary will have the following keys and values:
|
| 59 |
+
|
| 60 |
+
- "text": (str) the extracted structured text from the page.
|
| 61 |
+
- "page_idx": (int) the index of the page.
|
| 62 |
+
- "document_name": (str) the name of the document.
|
| 63 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 64 |
+
- "file_url": (str) the URL of the PDF file.
|
| 65 |
+
- "meta": (dict) the metadata extracted from the page by marker-pdf.
|
| 66 |
|
| 67 |
Args:
|
| 68 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
CHANGED
|
@@ -52,14 +52,15 @@ class PDFPlumberTextLoader(BaseTextLoader):
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
| 54 |
|
| 55 |
-
Returns
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
Args:
|
| 65 |
page_idx (int): The index of the page to process.
|
|
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
| 54 |
|
| 55 |
+
Returns:
|
| 56 |
+
Dict[str, str]: A dictionary with the processed page data.
|
| 57 |
+
The dictionary will have the following keys and values:
|
| 58 |
+
|
| 59 |
+
- "text": (str) the extracted text from the page.
|
| 60 |
+
- "page_idx": (int) the index of the page.
|
| 61 |
+
- "document_name": (str) the name of the document.
|
| 62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 63 |
+
- "file_url": (str) the URL of the PDF file.
|
| 64 |
|
| 65 |
Args:
|
| 66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
CHANGED
|
@@ -52,14 +52,15 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
| 54 |
|
| 55 |
-
Returns
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
Args:
|
| 65 |
page_idx (int): The index of the page to process.
|
|
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
| 54 |
|
| 55 |
+
Returns:
|
| 56 |
+
Dict[str, str]: A dictionary with the processed page data.
|
| 57 |
+
The dictionary will have the following keys and values:
|
| 58 |
+
|
| 59 |
+
- "text": (str) the processed page data in markdown format.
|
| 60 |
+
- "page_idx": (int) the index of the page.
|
| 61 |
+
- "document_name": (str) the name of the document.
|
| 62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 63 |
+
- "file_url": (str) the URL of the PDF file.
|
| 64 |
|
| 65 |
Args:
|
| 66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
CHANGED
|
@@ -52,14 +52,15 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
| 54 |
|
| 55 |
-
Returns
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
Args:
|
| 65 |
page_idx (int): The index of the page to process.
|
|
|
|
| 52 |
"""
|
| 53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
| 54 |
|
| 55 |
+
Returns:
|
| 56 |
+
Dict[str, str]: A dictionary with the processed page data.
|
| 57 |
+
The dictionary will have the following keys and values:
|
| 58 |
+
|
| 59 |
+
- "text": (str) the extracted text from the page.
|
| 60 |
+
- "page_idx": (int) the index of the page.
|
| 61 |
+
- "document_name": (str) the name of the document.
|
| 62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
| 63 |
+
- "file_url": (str) the URL of the PDF file.
|
| 64 |
|
| 65 |
Args:
|
| 66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/retrieval/multi_modal_retrieval.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
import os
|
| 2 |
from typing import Any, Optional
|
| 3 |
|
| 4 |
-
import wandb
|
| 5 |
import weave
|
| 6 |
from byaldi import RAGMultiModalModel
|
| 7 |
from PIL import Image
|
| 8 |
|
|
|
|
|
|
|
| 9 |
from ..utils import get_wandb_artifact
|
| 10 |
|
| 11 |
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import Any, Optional
|
| 3 |
|
|
|
|
| 4 |
import weave
|
| 5 |
from byaldi import RAGMultiModalModel
|
| 6 |
from PIL import Image
|
| 7 |
|
| 8 |
+
import wandb
|
| 9 |
+
|
| 10 |
from ..utils import get_wandb_artifact
|
| 11 |
|
| 12 |
|