# Source: tommulder, commit 5537ceb ("style: format Python files with Black")
"""Image and PDF preprocessing utilities for Dots.OCR.
This module handles PDF to image conversion, image preprocessing,
and multi-page document processing for the Dots.OCR model.
"""
import os
import logging
from typing import List, Tuple, Optional, Union
from pathlib import Path
import io
import fitz # PyMuPDF
import numpy as np
from PIL import Image, ImageOps
import cv2
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Settings read once from the environment at import time
PDF_DPI = int(os.environ.get("DOTS_OCR_PDF_DPI", "300"))
PDF_MAX_PAGES = int(os.environ.get("DOTS_OCR_PDF_MAX_PAGES", "10"))
# DOTS_OCR_IMAGE_MAX_SIZE is given in megabytes (default 10) and stored in bytes
IMAGE_MAX_SIZE = 1024 * 1024 * int(os.environ.get("DOTS_OCR_IMAGE_MAX_SIZE", "10"))
class ImagePreprocessor:
    """Handles image preprocessing for Dots.OCR model.

    Images are oriented, converted to RGB and resized so that the pixel
    count falls between ``min_pixels`` and ``max_pixels`` and both
    dimensions are multiples of ``divisor``.
    """

    def __init__(
        self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28
    ):
        """Initialize the image preprocessor.

        Args:
            min_pixels: Minimum pixel count for images
            max_pixels: Maximum pixel count for images
            divisor: Required divisor for image dimensions
        """
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.divisor = divisor

    def _target_size(self, width: int, height: int) -> Tuple[int, int]:
        """Compute the output size for an image of ``width`` x ``height``.

        The image is scaled (keeping aspect ratio) into the configured pixel
        window and each side is rounded up to a multiple of ``divisor``. If
        rounding up would exceed ``max_pixels``, the sides are rounded down
        instead; if one side had to be clamped up to ``divisor`` and the
        result is still too large, the longer side is shortened to fit.

        Args:
            width: Source width in pixels
            height: Source height in pixels

        Returns:
            ``(new_width, new_height)``, both multiples of ``divisor``

        Raises:
            ValueError: If either dimension is not positive
        """
        if width <= 0 or height <= 0:
            raise ValueError(f"Image has invalid dimensions: {width}x{height}")

        pixels = width * height
        if pixels < self.min_pixels:
            scale = (self.min_pixels / pixels) ** 0.5
        elif pixels > self.max_pixels:
            scale = (self.max_pixels / pixels) ** 0.5
        else:
            scale = None

        if scale is None:
            scaled_w, scaled_h = width, height
        else:
            # int() truncates; clamp to 1 so extreme aspect ratios never reach 0
            scaled_w = max(1, int(width * scale))
            scaled_h = max(1, int(height * scale))

        step = self.divisor
        # Round up to the next multiple of the divisor
        new_w = -(-scaled_w // step) * step
        new_h = -(-scaled_h // step) * step

        if new_w * new_h > self.max_pixels:
            # Rounding up overshot the limit: round down instead
            new_w = max(step, scaled_w // step * step)
            new_h = max(step, scaled_h // step * step)

        if new_w * new_h > self.max_pixels:
            # Only reachable when a side was clamped up to `step`;
            # shorten the longer side so the pixel limit still holds.
            if new_w >= new_h:
                new_w = max(step, self.max_pixels // new_h // step * step)
            else:
                new_h = max(step, self.max_pixels // new_w // step * step)

        return new_w, new_h

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """Preprocess an image to meet model requirements.

        Args:
            image: Input PIL Image

        Returns:
            Preprocessed PIL Image (RGB, EXIF orientation applied, resized
            to the size computed by ``_target_size``)

        Raises:
            ValueError: If the image has a zero width or height
        """
        # Auto-orient first: convert() returns a new image object that may no
        # longer expose the source file's EXIF data (e.g. TIFF tags).
        image = ImageOps.exif_transpose(image)

        # Convert to RGB if necessary
        if image.mode != "RGB":
            image = image.convert("RGB")

        width, height = image.size
        current_pixels = width * height
        logger.info(f"Original image size: {width}x{height} ({current_pixels} pixels)")

        new_width, new_height = self._target_size(width, height)

        # Resample once, straight to the final size
        if (new_width, new_height) != (width, height):
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            logger.info(
                f"Resized image to {new_width}x{new_height} "
                f"(dimensions divisible by {self.divisor})"
            )

        return image

    def crop_by_roi(
        self, image: Image.Image, roi: Tuple[float, float, float, float]
    ) -> Image.Image:
        """Crop image using ROI coordinates.

        Coordinates are clamped to the image bounds, and the bottom-right
        corner is clamped to be no smaller than the top-left corner, so an
        inverted ROI produces an empty (zero-area) crop rather than an error.

        Args:
            image: Input PIL Image
            roi: ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]

        Returns:
            Cropped PIL Image
        """
        x1, y1, x2, y2 = roi
        width, height = image.size

        # Convert normalized coordinates to pixel coordinates
        x1_px = int(x1 * width)
        y1_px = int(y1 * height)
        x2_px = int(x2 * width)
        y2_px = int(y2 * height)

        # Ensure coordinates are within image bounds
        x1_px = max(0, min(x1_px, width))
        y1_px = max(0, min(y1_px, height))
        x2_px = max(x1_px, min(x2_px, width))
        y2_px = max(y1_px, min(y2_px, height))

        # Crop the image
        cropped = image.crop((x1_px, y1_px, x2_px, y2_px))
        logger.info(f"Cropped image to {x2_px - x1_px}x{y2_px - y1_px} pixels")
        return cropped
class PDFProcessor:
    """Handles PDF to image conversion and multi-page processing."""

    def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
        """Initialize the PDF processor.

        Args:
            dpi: DPI for PDF to image conversion
            max_pages: Maximum number of pages to process
        """
        self.dpi = dpi
        self.max_pages = max_pages

    def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
        """Convert PDF to list of images.

        Only the first ``max_pages`` pages are rendered.

        Args:
            pdf_data: PDF file data as bytes

        Returns:
            List of PIL Images, one per page

        Raises:
            RuntimeError: If the PDF cannot be opened or a page cannot be
                rendered; the original exception is attached as the cause
        """
        try:
            # The context manager closes the document even if rendering fails
            with fitz.open(stream=pdf_data, filetype="pdf") as pdf_document:
                # Limit number of pages to process
                num_pages = min(len(pdf_document), self.max_pages)
                logger.info(f"Processing {num_pages} pages from PDF")

                # 72 is the default DPI, so dpi / 72 is the zoom factor.
                # The matrix is the same for every page, so build it once.
                zoom = self.dpi / 72
                mat = fitz.Matrix(zoom, zoom)

                images = []
                for page_num in range(num_pages):
                    pix = pdf_document[page_num].get_pixmap(matrix=mat)

                    # Convert to PIL Image via an in-memory PNG
                    img_data = pix.tobytes("png")
                    image = Image.open(io.BytesIO(img_data))
                    images.append(image)
                    logger.info(
                        f"Converted page {page_num + 1} to image: {image.size}"
                    )

                return images
        except Exception as e:
            logger.error(f"Failed to convert PDF to images: {e}")
            raise RuntimeError(f"PDF conversion failed: {e}") from e

    def is_pdf(self, file_data: bytes) -> bool:
        """Check if file data is a PDF.

        Args:
            file_data: File data as bytes

        Returns:
            True if the data starts with the ``%PDF-`` header
        """
        return file_data.startswith(b"%PDF-")

    def get_pdf_page_count(self, pdf_data: bytes) -> int:
        """Get the number of pages in a PDF.

        Args:
            pdf_data: PDF file data as bytes

        Returns:
            Number of pages in the PDF, or 0 if the PDF cannot be read
        """
        try:
            with fitz.open(stream=pdf_data, filetype="pdf") as pdf_document:
                return len(pdf_document)
        except Exception as e:
            # Best effort: callers get 0 instead of an exception
            logger.error(f"Failed to get PDF page count: {e}")
            return 0
class DocumentProcessor:
    """Main document processing class that handles both images and PDFs."""

    def __init__(self):
        """Initialize the document processor."""
        self.image_preprocessor = ImagePreprocessor()
        self.pdf_processor = PDFProcessor()

    def process_document(
        self, file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
    ) -> List[Image.Image]:
        """Process a document (image or PDF) and return preprocessed images.

        Pages or images that fail preprocessing are logged and skipped, so
        the returned list may be shorter than the number of pages.

        Args:
            file_data: Document file data as bytes
            roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]

        Returns:
            List of preprocessed PIL Images

        Raises:
            RuntimeError: If the data cannot be opened, or if no image could
                be preprocessed
        """
        # Check if it's a PDF
        if self.pdf_processor.is_pdf(file_data):
            logger.info("Processing PDF document")
            images = self.pdf_processor.pdf_to_images(file_data)
        else:
            # Process as image
            logger.info("Processing image document")
            try:
                images = [Image.open(io.BytesIO(file_data))]
            except Exception as e:
                logger.error(f"Failed to open image: {e}")
                # Chain the cause so the original decoder error stays visible
                raise RuntimeError(f"Image processing failed: {e}") from e

        # Preprocess each image
        processed_images = []
        for i, image in enumerate(images):
            try:
                # Apply ROI cropping if provided
                if roi is not None:
                    image = self.image_preprocessor.crop_by_roi(image, roi)

                # Preprocess image for model requirements
                processed_image = self.image_preprocessor.preprocess_image(image)
            except Exception as e:
                # The error is swallowed here, so keep the traceback in the log;
                # continue with other images even if one fails.
                logger.exception(f"Failed to preprocess image {i + 1}: {e}")
                continue

            processed_images.append(processed_image)
            logger.info(f"Processed image {i + 1}: {processed_image.size}")

        if not processed_images:
            raise RuntimeError("No images could be processed from the document")

        logger.info(f"Successfully processed {len(processed_images)} images")
        return processed_images

    def validate_file_size(self, file_data: bytes) -> bool:
        """Validate that file size is within limits.

        Args:
            file_data: File data as bytes

        Returns:
            True if file size is acceptable (at most ``IMAGE_MAX_SIZE`` bytes)
        """
        file_size = len(file_data)
        if file_size > IMAGE_MAX_SIZE:
            logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
            return False
        return True

    def get_document_info(self, file_data: bytes) -> dict:
        """Get information about the document.

        Args:
            file_data: Document file data as bytes

        Returns:
            Dictionary with keys ``file_size`` (bytes), ``is_pdf`` and
            ``page_count`` (1 for non-PDF data)
        """
        info = {
            "file_size": len(file_data),
            "is_pdf": self.pdf_processor.is_pdf(file_data),
            "page_count": 1,
        }
        if info["is_pdf"]:
            info["page_count"] = self.pdf_processor.get_pdf_page_count(file_data)
        return info
# Shared DocumentProcessor, created on first use
_document_processor: Optional[DocumentProcessor] = None


def get_document_processor() -> DocumentProcessor:
    """Return the shared DocumentProcessor, creating it on the first call."""
    global _document_processor
    if _document_processor is not None:
        return _document_processor
    _document_processor = DocumentProcessor()
    return _document_processor
def process_document(
    file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
) -> List[Image.Image]:
    """Preprocess a document with the shared processor and return its images."""
    return get_document_processor().process_document(file_data, roi)
def validate_file_size(file_data: bytes) -> bool:
    """Check the size of ``file_data`` against the limit via the shared processor."""
    return get_document_processor().validate_file_size(file_data)
def get_document_info(file_data: bytes) -> dict:
    """Return the shared processor's information dictionary for ``file_data``."""
    return get_document_processor().get_document_info(file_data)