LiamKhoaLe's picture
Merge commit 'e34edc7cd55f292dd0b192dc00b782c22208fde6' as 'ingestion_python'
ee39cc9
raw
history blame
2.12 kB
import io
from typing import List, Dict, Any
import fitz # PyMuPDF
from docx import Document
from PIL import Image
import numpy as np
from ..logger import get_logger
logger = get_logger("PARSER", __name__)
def parse_pdf_bytes(b: bytes) -> List[Dict[str, Any]]:
"""
Returns list of pages, each {'page_num': i, 'text': str, 'images': [PIL.Image]}
"""
pages = []
with fitz.open(stream=b, filetype="pdf") as doc:
for i, page in enumerate(doc):
text = page.get_text("text")
images = []
for img in page.get_images(full=True):
xref = img[0]
try:
pix = fitz.Pixmap(doc, xref)
# Convert CMYK/Alpha safely
if pix.n - pix.alpha >= 4:
pix = fitz.Pixmap(fitz.csRGB, pix)
# Use PNG bytes to avoid 'not enough image data'
png_bytes = pix.tobytes("png")
im = Image.open(io.BytesIO(png_bytes)).convert("RGB")
images.append(im)
except Exception as e:
logger.warning(f"Failed to extract image on page {i+1}: {e}")
finally:
try:
pix = None
except Exception:
pass
pages.append({"page_num": i + 1, "text": text, "images": images})
logger.info(f"Parsed PDF with {len(pages)} pages")
return pages
def parse_docx_bytes(b: bytes) -> List[Dict[str, Any]]:
f = io.BytesIO(b)
doc = Document(f)
text = []
images = []
for rel in doc.part.rels.values():
if "image" in rel.reltype:
data = rel.target_part.blob
try:
im = Image.open(io.BytesIO(data)).convert("RGB")
images.append(im)
except Exception:
pass
for p in doc.paragraphs:
text.append(p.text)
pages = [{"page_num": 1, "text": "\n".join(text), "images": images}]
logger.info("Parsed DOCX into single concatenated page")
return pages