Spaces:
Sleeping
Sleeping
File size: 2,120 Bytes
ee39cc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import io
from typing import List, Dict, Any
import fitz # PyMuPDF
from docx import Document
from PIL import Image
import numpy as np
from ..logger import get_logger
# Module-level logger shared by the parsers below; obtained from the project's
# get_logger helper, called with "PARSER" and this module's __name__.
logger = get_logger("PARSER", __name__)
def parse_pdf_bytes(b: bytes) -> List[Dict[str, Any]]:
    """
    Extract the text and the embedded images of each page of an in-memory PDF.

    Returns a list with one dict per page, in document order:
    {'page_num': 1-based page index, 'text': str, 'images': [PIL.Image]}.
    Every image is converted to RGB. An image that cannot be extracted is
    logged as a warning and skipped; the rest of the page is still returned.
    """
    parsed: List[Dict[str, Any]] = []

    with fitz.open(stream=b, filetype="pdf") as pdf:
        for page_num, page in enumerate(pdf, start=1):
            page_text = page.get_text("text")
            page_images = []

            for image_info in page.get_images(full=True):
                # First field of each image entry is the image's xref.
                xref = image_info[0]
                try:
                    pixmap = fitz.Pixmap(pdf, xref)
                    # Four or more colour channels (alpha excluded), e.g.
                    # CMYK: convert to RGB first.
                    colour_channels = pixmap.n - pixmap.alpha
                    if colour_channels >= 4:
                        pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
                    # Going through PNG bytes avoids 'not enough image data'.
                    encoded = pixmap.tobytes("png")
                    picture = Image.open(io.BytesIO(encoded)).convert("RGB")
                    page_images.append(picture)
                except Exception as e:
                    logger.warning(f"Failed to extract image on page {page_num}: {e}")
                finally:
                    # Drop the pixmap reference before handling the next image.
                    pixmap = None

            parsed.append(
                {"page_num": page_num, "text": page_text, "images": page_images}
            )

    logger.info(f"Parsed PDF with {len(parsed)} pages")
    return parsed
def parse_docx_bytes(b: bytes) -> List[Dict[str, Any]]:
    """
    Parse an in-memory DOCX file into a single concatenated "page".

    The whole document is returned as one entry: the text of every paragraph
    in ``doc.paragraphs`` joined with newlines, plus every image found in the
    main document part's relationships, converted to RGB.

    Args:
        b: Raw bytes of the .docx file.

    Returns:
        A one-element list:
        [{'page_num': 1, 'text': str, 'images': [PIL.Image]}].
        An image that cannot be loaded is logged as a warning and skipped.
    """
    doc = Document(io.BytesIO(b))

    images = []
    for rel in doc.part.rels.values():
        if "image" not in rel.reltype:
            continue
        try:
            # Read the blob inside the try: python-docx raises when
            # target_part is accessed on an externally linked (non-embedded)
            # image, and one such image must not abort the whole parse.
            data = rel.target_part.blob
            im = Image.open(io.BytesIO(data)).convert("RGB")
        except Exception as e:
            # Best-effort extraction: report the failure and keep going,
            # mirroring how parse_pdf_bytes treats unreadable images.
            logger.warning(f"Failed to extract image from DOCX: {e}")
        else:
            images.append(im)

    text = "\n".join(p.text for p in doc.paragraphs)

    pages = [{"page_num": 1, "text": text, "images": images}]
    logger.info("Parsed DOCX into single concatenated page")
    return pages
|