Spaces:

algoryn
/

dots-ocr-idcard

Paused

File size: 11,183 Bytes

"""Image and PDF preprocessing utilities for Dots.OCR.

This module handles PDF to image conversion, image preprocessing,
and multi-page document processing for the Dots.OCR model.
"""

import io
import logging
import math
import os
from pathlib import Path
from typing import List, Tuple, Optional, Union

import cv2
import fitz  # PyMuPDF
import numpy as np
from PIL import Image, ImageOps

# Module-level logger
logger = logging.getLogger(__name__)

# Runtime settings, each overridable through an environment variable.
_BYTES_PER_MB = 1024 * 1024

# Default rendering resolution and page cap used by PDFProcessor.
PDF_DPI = int(os.environ.get("DOTS_OCR_PDF_DPI", "300"))
PDF_MAX_PAGES = int(os.environ.get("DOTS_OCR_PDF_MAX_PAGES", "10"))

# Upload size limit in bytes; the environment value is given in megabytes
# (10 MB when unset).
IMAGE_MAX_SIZE = _BYTES_PER_MB * int(os.environ.get("DOTS_OCR_IMAGE_MAX_SIZE", "10"))


class ImagePreprocessor:
    """Handles image preprocessing for Dots.OCR model.

    ``preprocess_image`` returns an RGB image whose sides are multiples of
    ``divisor`` and whose pixel count lies within ``[min_pixels, max_pixels]``
    (as long as those limits can be met with at least one ``divisor`` per
    side).
    """

    def __init__(
        self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28
    ):
        """Initialize the image preprocessor.

        Args:
            min_pixels: Minimum pixel count for images
            max_pixels: Maximum pixel count for images
            divisor: Required divisor for image dimensions

        Raises:
            ValueError: If ``divisor`` is smaller than 1.
        """
        if divisor < 1:
            raise ValueError(f"divisor must be a positive integer, got {divisor}")

        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.divisor = divisor

    @staticmethod
    def _ceil_div(numerator: int, denominator: int) -> int:
        """Return ``ceil(numerator / denominator)`` using integer arithmetic."""
        return -(-numerator // denominator)

    @classmethod
    def _ceil_sqrt(cls, numerator: int, denominator: int) -> int:
        """Return ``ceil(sqrt(numerator / denominator))`` for a positive ratio."""
        # The smallest n with n*n >= x is isqrt(ceil(x) - 1) + 1 for x > 0.
        return math.isqrt(cls._ceil_div(numerator, denominator) - 1) + 1

    def compute_target_size(self, width: int, height: int) -> Tuple[int, int]:
        """Compute the output size for an image of ``width`` x ``height``.

        Sides are rounded up to a multiple of ``divisor``. Images with fewer
        than ``min_pixels`` pixels are enlarged first. If the rounded size
        would exceed ``max_pixels`` the sides are instead scaled down and
        rounded *down* to a multiple of ``divisor``, so the pixel budget is
        respected. The math is done on integers to avoid float rounding
        pushing a side across a ``divisor`` boundary.

        Args:
            width: Source width in pixels
            height: Source height in pixels

        Returns:
            ``(new_width, new_height)``

        Raises:
            ValueError: If either dimension is not positive.
        """
        if width <= 0 or height <= 0:
            raise ValueError(f"Cannot preprocess an empty image ({width}x{height})")

        step = self.divisor
        min_pixels = int(self.min_pixels)
        max_pixels = int(self.max_pixels)
        pixels = width * height

        if pixels < min_pixels:
            # side * sqrt(min_pixels / pixels) == sqrt(side**2 * min_pixels / pixels);
            # rounding both sides up keeps the product >= min_pixels.
            base_width = self._ceil_sqrt(width * min_pixels, height)
            base_height = self._ceil_sqrt(height * min_pixels, width)
        else:
            base_width, base_height = width, height

        new_width = self._ceil_div(base_width, step) * step
        new_height = self._ceil_div(base_height, step) * step

        if new_width * new_height > max_pixels:
            # floor(side * sqrt(max_pixels / pixels)); the product of the two
            # floors can never exceed max_pixels.
            fitted_width = math.isqrt(width * max_pixels // height)
            fitted_height = math.isqrt(height * max_pixels // width)
            new_width = max(step, fitted_width // step * step)
            new_height = max(step, fitted_height // step * step)

            if new_width * new_height > max_pixels:
                # Only reachable when the short side was lifted to one
                # `step` (extreme aspect ratio): trim the long side to fit.
                if new_width >= new_height:
                    new_width = max(step, max_pixels // new_height // step * step)
                else:
                    new_height = max(step, max_pixels // new_width // step * step)

        return new_width, new_height

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """Preprocess an image to meet model requirements.

        Applies the EXIF orientation, converts to RGB and resizes once
        (LANCZOS) to the size given by ``compute_target_size``.

        Args:
            image: Input PIL Image

        Returns:
            Preprocessed PIL Image

        Raises:
            ValueError: If the image has a zero-sized dimension.
        """
        # Auto-orient before converting modes: convert() returns a new image
        # object, and some formats may keep orientation metadata on the
        # originally decoded object only.
        image = ImageOps.exif_transpose(image)

        # Convert to RGB if necessary
        if image.mode != "RGB":
            image = image.convert("RGB")

        width, height = image.size
        logger.info(f"Original image size: {width}x{height} ({width * height} pixels)")

        target_size = self.compute_target_size(width, height)
        if target_size != (width, height):
            # A single resampling pass avoids the extra blur of resizing twice.
            image = image.resize(target_size, Image.Resampling.LANCZOS)
            logger.info(
                f"Resized image to {target_size[0]}x{target_size[1]} "
                f"(dimensions divisible by {self.divisor})"
            )

        return image

    def crop_by_roi(
        self, image: Image.Image, roi: Tuple[float, float, float, float]
    ) -> Image.Image:
        """Crop image using ROI coordinates.

        Coordinates are clamped to the image bounds, and the bottom-right
        corner is never placed before the top-left one; a ROI lying outside
        the image or with inverted corners therefore yields an empty crop.

        Args:
            image: Input PIL Image
            roi: ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]

        Returns:
            Cropped PIL Image
        """
        x1, y1, x2, y2 = roi
        width, height = image.size

        # Convert normalized coordinates to pixel coordinates
        x1_px = int(x1 * width)
        y1_px = int(y1 * height)
        x2_px = int(x2 * width)
        y2_px = int(y2 * height)

        # Ensure coordinates are within image bounds
        x1_px = max(0, min(x1_px, width))
        y1_px = max(0, min(y1_px, height))
        x2_px = max(x1_px, min(x2_px, width))
        y2_px = max(y1_px, min(y2_px, height))

        # Crop the image
        cropped = image.crop((x1_px, y1_px, x2_px, y2_px))
        logger.info(f"Cropped image to {x2_px - x1_px}x{y2_px - y1_px} pixels")

        return cropped


class PDFProcessor:
    """Handles PDF to image conversion and multi-page processing."""

    def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
        """Initialize the PDF processor.

        Args:
            dpi: DPI for PDF to image conversion
            max_pages: Maximum number of pages to process
        """
        self.dpi = dpi
        self.max_pages = max_pages

    def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
        """Convert PDF to list of images.

        Only the first ``max_pages`` pages are rendered. The PDF document is
        always closed, including when rendering fails part-way through.

        Args:
            pdf_data: PDF file data as bytes

        Returns:
            List of PIL Images, one per rendered page

        Raises:
            RuntimeError: If the PDF cannot be opened or rendered; the
                original exception is attached as ``__cause__``.
        """
        pdf_document = None
        try:
            # Open PDF from bytes
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

            # Limit number of pages to process
            total_pages = len(pdf_document)
            num_pages = min(total_pages, self.max_pages)
            logger.info(f"Processing {num_pages} pages from PDF")
            if num_pages < total_pages:
                logger.warning(
                    f"PDF has {total_pages} pages; only the first {num_pages} are processed"
                )

            # Scale factor relative to the 72 DPI baseline; identical for
            # every page, so it is built once.
            zoom = self.dpi / 72
            mat = fitz.Matrix(zoom, zoom)

            images = []
            for page_num in range(num_pages):
                pix = pdf_document[page_num].get_pixmap(matrix=mat)

                # Convert to PIL Image via an in-memory PNG
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                images.append(image)

                logger.info(f"Converted page {page_num + 1} to image: {image.size}")

            return images

        except Exception as e:
            logger.error(f"Failed to convert PDF to images: {e}")
            raise RuntimeError(f"PDF conversion failed: {e}") from e
        finally:
            if pdf_document is not None:
                pdf_document.close()

    def is_pdf(self, file_data: bytes) -> bool:
        """Check if file data is a PDF.

        The check only looks for the ``%PDF-`` signature at the very start
        of the data.

        Args:
            file_data: File data as bytes

        Returns:
            True if file is a PDF
        """
        return file_data.startswith(b"%PDF-")

    def get_pdf_page_count(self, pdf_data: bytes) -> int:
        """Get the number of pages in a PDF.

        Args:
            pdf_data: PDF file data as bytes

        Returns:
            Number of pages in the PDF, or 0 if the data cannot be read as a
            PDF (the failure is logged, not raised)
        """
        pdf_document = None
        try:
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
            return len(pdf_document)
        except Exception as e:
            logger.error(f"Failed to get PDF page count: {e}")
            return 0
        finally:
            if pdf_document is not None:
                pdf_document.close()


class DocumentProcessor:
    """Main document processing class that handles both images and PDFs."""

    def __init__(self):
        """Initialize the document processor."""
        self.image_preprocessor = ImagePreprocessor()
        self.pdf_processor = PDFProcessor()

    def process_document(
        self, file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
    ) -> List[Image.Image]:
        """Process a document (image or PDF) and return preprocessed images.

        PDFs are rendered page by page; any other data is opened as a single
        image. Each image is optionally cropped to ``roi`` and then
        preprocessed. An image that fails preprocessing is logged and
        skipped so the remaining ones are still returned.

        Args:
            file_data: Document file data as bytes
            roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]

        Returns:
            List of preprocessed PIL Images

        Raises:
            RuntimeError: If the document cannot be opened, or if no image
                could be preprocessed.
        """
        # Check if it's a PDF
        if self.pdf_processor.is_pdf(file_data):
            logger.info("Processing PDF document")
            images = self.pdf_processor.pdf_to_images(file_data)
        else:
            # Process as image
            logger.info("Processing image document")
            try:
                images = [Image.open(io.BytesIO(file_data))]
            except Exception as e:
                logger.error(f"Failed to open image: {e}")
                # Chain the cause so the underlying decoder error stays visible.
                raise RuntimeError(f"Image processing failed: {e}") from e

        # Preprocess each image
        processed_images = []
        for i, image in enumerate(images):
            try:
                # Apply ROI cropping if provided
                if roi is not None:
                    image = self.image_preprocessor.crop_by_roi(image, roi)

                # Preprocess image for model requirements
                processed_image = self.image_preprocessor.preprocess_image(image)
            except Exception as e:
                # Best effort: keep going with the other images, but record
                # the traceback since the exception is not re-raised.
                logger.error(f"Failed to preprocess image {i + 1}: {e}", exc_info=True)
                continue

            processed_images.append(processed_image)
            logger.info(f"Processed image {i + 1}: {processed_image.size}")

        if not processed_images:
            raise RuntimeError("No images could be processed from the document")

        logger.info(f"Successfully processed {len(processed_images)} images")
        return processed_images

    def validate_file_size(self, file_data: bytes) -> bool:
        """Validate that file size is within limits.

        Args:
            file_data: File data as bytes

        Returns:
            True if the data is no larger than ``IMAGE_MAX_SIZE`` bytes;
            otherwise a warning is logged and False is returned
        """
        file_size = len(file_data)
        if file_size > IMAGE_MAX_SIZE:
            logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
            return False
        return True

    def get_document_info(self, file_data: bytes) -> dict:
        """Get information about the document.

        Args:
            file_data: Document file data as bytes

        Returns:
            Dictionary with ``file_size`` (bytes), ``is_pdf`` and
            ``page_count`` (1 for non-PDF data)
        """
        is_pdf = self.pdf_processor.is_pdf(file_data)
        return {
            "file_size": len(file_data),
            "is_pdf": is_pdf,
            "page_count": (
                self.pdf_processor.get_pdf_page_count(file_data) if is_pdf else 1
            ),
        }


# Lazily created processor shared by the module-level helper functions
_document_processor: Optional[DocumentProcessor] = None


def get_document_processor() -> DocumentProcessor:
    """Return the shared DocumentProcessor, creating it on first use."""
    global _document_processor
    if _document_processor is not None:
        return _document_processor

    _document_processor = DocumentProcessor()
    return _document_processor


def process_document(
    file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
) -> List[Image.Image]:
    """Preprocess a document with the shared processor.

    Args:
        file_data: Document file data as bytes
        roi: Optional ROI forwarded unchanged to the processor

    Returns:
        List of preprocessed PIL Images
    """
    return get_document_processor().process_document(file_data, roi)


def validate_file_size(file_data: bytes) -> bool:
    """Check the size of ``file_data`` using the shared processor.

    Returns:
        Whatever ``DocumentProcessor.validate_file_size`` returns for the data
    """
    return get_document_processor().validate_file_size(file_data)


def get_document_info(file_data: bytes) -> dict:
    """Describe ``file_data`` using the shared processor.

    Returns:
        The dictionary built by ``DocumentProcessor.get_document_info``
    """
    return get_document_processor().get_document_info(file_data)