Spaces:
Sleeping
Sleeping
| import io | |
| from typing import List, Dict, Any | |
| import fitz # PyMuPDF | |
| from docx import Document | |
| from PIL import Image | |
| import numpy as np | |
| from ..logger import get_logger | |
| logger = get_logger("PARSER", __name__) | |
def _extract_pdf_image(doc, xref):
    """Return the embedded image stored at *xref* in *doc* as an RGB PIL image.

    Any exception raised by PyMuPDF or Pillow propagates to the caller, which
    decides whether a single broken image should abort the page.
    """
    pix = fitz.Pixmap(doc, xref)
    # Pixmaps with four or more colour components (e.g. CMYK) are converted
    # to RGB first so that they can be encoded as PNG below.
    if pix.n - pix.alpha >= 4:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    # Go through PNG bytes instead of raw samples to avoid Pillow's
    # 'not enough image data' errors.
    png_bytes = pix.tobytes("png")
    # The pixmap is a local here, so it is released as soon as we return.
    return Image.open(io.BytesIO(png_bytes)).convert("RGB")


def parse_pdf_bytes(b: bytes) -> List[Dict[str, Any]]:
    """Parse an in-memory PDF into per-page text and images.

    Args:
        b: Raw bytes of the PDF document.

    Returns:
        One dict per page, in document order, of the form
        ``{'page_num': int, 'text': str, 'images': [PIL.Image]}``.
        ``page_num`` is 1-based. Images are converted to RGB.

    Image extraction is best-effort: an image that cannot be decoded is
    logged as a warning and skipped; the rest of the page is still returned.
    """
    pages = []
    with fitz.open(stream=b, filetype="pdf") as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text")
            images = []
            for img in page.get_images(full=True):
                xref = img[0]  # first field of the image tuple is its xref
                try:
                    images.append(_extract_pdf_image(doc, xref))
                except Exception as e:
                    # Deliberately broad: one bad image must not fail the page.
                    logger.warning(f"Failed to extract image on page {page_num}: {e}")
            pages.append({"page_num": page_num, "text": text, "images": images})
    logger.info(f"Parsed PDF with {len(pages)} pages")
    return pages
def parse_docx_bytes(b: bytes) -> List[Dict[str, Any]]:
    """Parse an in-memory DOCX into a single pseudo-page of text and images.

    Args:
        b: Raw bytes of the DOCX document.

    Returns:
        A one-element list ``[{'page_num': 1, 'text': str, 'images': [PIL.Image]}]``.
        ``text`` is the text of ``doc.paragraphs`` joined with newlines, and
        ``images`` holds every embedded image reachable through the main
        document part's relationships, converted to RGB.

    Image extraction is best-effort: an image that cannot be read or decoded
    is logged as a warning and skipped.
    """
    doc = Document(io.BytesIO(b))

    images = []
    for rel in doc.part.rels.values():
        if "image" not in rel.reltype:
            continue
        # Linked (external) images have no embedded part; asking for
        # target_part on such a relationship raises, so skip them up front.
        if rel.is_external:
            continue
        try:
            data = rel.target_part.blob
            images.append(Image.open(io.BytesIO(data)).convert("RGB"))
        except Exception as e:
            # Deliberately broad: one bad image must not fail the document.
            logger.warning(f"Failed to extract image from DOCX: {e}")

    text = "\n".join(p.text for p in doc.paragraphs)
    pages = [{"page_num": 1, "text": text, "images": images}]
    logger.info("Parsed DOCX into single concatenated page")
    return pages