Spaces:
Paused
Paused
| """Response builder for Dots.OCR API responses. | |
| This module handles the construction and validation of OCR API responses | |
| according to the specified schema with proper error handling and metadata. | |
| Debug-mode logging is supported to surface detailed information about | |
| extraction results when troubleshooting in environments like Hugging Face. | |
| """ | |
| import logging | |
| import os | |
| import time | |
| from typing import List, Optional, Dict, Any | |
| from datetime import datetime | |
| from .api_models import OCRResponse, OCRDetection, ExtractedFields, MRZData, ExtractedField | |
| from .enhanced_field_extraction import EnhancedFieldExtractor | |
| # Configure logging | |
| logger = logging.getLogger(__name__) | |
class OCRResponseBuilder:
    """Builds OCR API responses with proper validation and metadata.

    For every page of OCR text the builder runs the field extractor, wraps
    the result in an ``OCRDetection`` and finally validates the assembled
    ``OCRResponse``. A page that fails to process is represented by an empty
    detection so that the detections list stays aligned with the input pages.
    """

    # Name of the environment variable that bounds the OCR text logged per
    # page in debug mode, and the value used when it is unset or unusable.
    _DEBUG_SNIPPET_ENV_VAR = "DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN"
    _DEFAULT_DEBUG_SNIPPET_LEN = 1200

    def __init__(self):
        """Initialize the response builder with its own field extractor."""
        self.field_extractor = EnhancedFieldExtractor()

    def build_response(
        self,
        request_id: str,
        media_type: str,
        processing_time: float,
        ocr_texts: List[str],
        page_metadata: Optional[List[Dict[str, Any]]] = None,
        debug: bool = False,
    ) -> OCRResponse:
        """Build a complete OCR response from extracted texts.

        Args:
            request_id: Unique request identifier
            media_type: Type of media processed ("image" or "pdf")
            processing_time: Total processing time in seconds
            ocr_texts: List of OCR text results (one per page)
            page_metadata: Optional metadata for each page
            debug: When True, emit detailed logs about OCR text and mapping

        Returns:
            Complete OCRResponse object with one detection per input page

        Raises:
            ValueError: If the assembled response fails validation
        """
        logger.info("Building response for %d pages", len(ocr_texts))
        detections = []

        # Resolved once per call; a bad env value must not abort the request.
        debug_snippet_len = self._resolve_debug_snippet_len()

        for i, ocr_text in enumerate(ocr_texts):
            page_number = i + 1
            try:
                # Extract fields and MRZ data
                extracted_fields = self.field_extractor.extract_fields(ocr_text)
                mrz_data = self.field_extractor.extract_mrz(ocr_text)

                if debug:
                    try:
                        self._log_debug_details(
                            page_number,
                            ocr_text,
                            extracted_fields,
                            mrz_data,
                            debug_snippet_len,
                        )
                    except Exception:
                        # Diagnostics are best-effort: a logging problem must
                        # not discard a page that was extracted successfully.
                        logger.exception(
                            "[debug] Page %d: failed to emit debug details",
                            page_number,
                        )

                # Create detection for this page
                detection = self._create_detection(
                    extracted_fields, mrz_data, i, page_metadata
                )
                detections.append(detection)

                # Report populated fields only; the raw attribute count also
                # includes attributes that are None.
                populated = sum(
                    1
                    for value in extracted_fields.__dict__.values()
                    if value is not None
                )
                logger.info(
                    "Page %d: %d fields, MRZ: %s",
                    page_number,
                    populated,
                    mrz_data is not None,
                )
            except Exception as e:
                # logger.exception keeps the traceback for troubleshooting.
                logger.exception("Failed to process page %d: %s", page_number, e)
                # Create empty detection for failed page
                detections.append(self._create_empty_detection(i))

        # Build final response
        response = OCRResponse(
            request_id=request_id,
            media_type=media_type,
            processing_time=processing_time,
            detections=detections,
        )

        # Validate response (raises ValueError on failure)
        self._validate_response(response)

        logger.info("Response built successfully: %d detections", len(detections))
        return response

    def _resolve_debug_snippet_len(self) -> int:
        """Return the maximum number of OCR text characters to log per page.

        Reads DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN. An unset, blank, non-integer
        or negative value falls back to the default of 1200 instead of
        raising, so a misconfigured environment cannot break response
        building.
        """
        default = self._DEFAULT_DEBUG_SNIPPET_LEN
        raw_value = os.getenv(self._DEBUG_SNIPPET_ENV_VAR)
        if raw_value is None or not raw_value.strip():
            return default
        try:
            snippet_len = int(raw_value)
        except ValueError:
            logger.warning(
                "Ignoring invalid %s=%r; using %d",
                self._DEBUG_SNIPPET_ENV_VAR,
                raw_value,
                default,
            )
            return default
        if snippet_len < 0:
            # A negative bound would slice from the end of the text.
            logger.warning(
                "Ignoring negative %s=%r; using %d",
                self._DEBUG_SNIPPET_ENV_VAR,
                raw_value,
                default,
            )
            return default
        return snippet_len

    def _log_debug_details(
        self,
        page_number: int,
        ocr_text: str,
        extracted_fields: Any,
        mrz_data: Any,
        snippet_len: int,
    ) -> None:
        """Log the OCR text snippet, non-null fields and MRZ info for a page.

        Args:
            page_number: 1-based page number used in the log messages
            ocr_text: Full OCR text of the page
            extracted_fields: Object returned by the field extractor; its
                ``__dict__`` is iterated and non-None attributes are expected
                to expose ``value``, ``confidence`` and ``source``
            mrz_data: MRZ result for the page, or None
            snippet_len: Maximum number of OCR text characters to log
        """
        # Log a bounded snippet of the OCR text to avoid overwhelming logs
        snippet = ocr_text[:snippet_len]
        if len(ocr_text) > snippet_len:
            snippet += "\n...[truncated]"
        logger.info(
            "[debug] Page %d: OCR text snippet (len=%d):\n%s",
            page_number,
            len(ocr_text),
            snippet,
        )

        # Prepare a compact dict of non-null extracted fields
        non_null_fields: Dict[str, Any] = {
            fname: {
                "value": fval.value,
                "confidence": fval.confidence,
                "source": fval.source,
            }
            for fname, fval in extracted_fields.__dict__.items()
            if fval is not None
        }
        logger.info(
            "[debug] Page %d: Extracted fields (non-null): %s",
            page_number,
            non_null_fields,
        )

        if mrz_data is None:
            logger.info("[debug] Page %d: No MRZ detected", page_number)
            return

        # Support both canonical and legacy attribute names
        raw_mrz = getattr(mrz_data, "raw_mrz", None) or getattr(
            mrz_data, "raw_text", None
        )
        mrz_type = getattr(mrz_data, "document_type", None) or getattr(
            mrz_data, "format_type", None
        )
        logger.info(
            "[debug] Page %d: MRZ detected — type=%s, confidence=%.2f",
            page_number,
            mrz_type,
            mrz_data.confidence,
        )
        if raw_mrz:
            logger.info("[debug] Page %d: MRZ raw text:\n%s", page_number, raw_mrz)

    def _create_detection(
        self,
        extracted_fields: ExtractedFields,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None,
    ) -> OCRDetection:
        """Create an OCR detection from extracted data.

        Args:
            extracted_fields: Extracted field data
            mrz_data: MRZ data if available
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            OCRDetection object
        """
        # Convert the extractor output to the format expected by OCRDetection
        converted_fields = self._convert_fields_format(extracted_fields)

        # Enhance MRZ data if available
        enhanced_mrz = self._enhance_mrz_data(mrz_data, page_index, page_metadata)

        return OCRDetection(
            mrz_data=enhanced_mrz,
            extracted_fields=converted_fields,
        )

    def _convert_fields_format(self, id_card_fields) -> ExtractedFields:
        """Convert extractor output to the format expected by OCRDetection.

        Every non-None attribute found in ``id_card_fields.__dict__`` is
        passed to ``ExtractedFields``. Values that look like Pydantic models
        are turned into plain dicts first: ``model_dump()`` is preferred when
        present (Pydantic v2), ``dict()`` is the fallback (Pydantic v1), and
        anything else is passed through unchanged.

        Args:
            id_card_fields: Object holding the extracted fields as attributes

        Returns:
            ExtractedFields object
        """
        field_dict: Dict[str, Any] = {}
        for field_name, field_value in id_card_fields.__dict__.items():
            if field_value is None:
                continue
            if hasattr(field_value, "model_dump"):
                field_dict[field_name] = field_value.model_dump()
            elif hasattr(field_value, "dict"):
                field_dict[field_name] = field_value.dict()
            else:
                field_dict[field_name] = field_value
        return ExtractedFields(**field_dict)

    def _enhance_mrz_data(
        self,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None,
    ) -> Optional[MRZData]:
        """Enhance MRZ data with additional context if available.

        Currently a pass-through: ``page_index`` and ``page_metadata`` are
        accepted so that page-specific adjustments (for example confidence
        tweaks) can be added here later, but they are not used yet.

        Args:
            mrz_data: Original MRZ data
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            The MRZ data unchanged, or None when no MRZ data was given
        """
        if mrz_data is None:
            return None
        return mrz_data

    def _create_empty_detection(self, page_index: int) -> OCRDetection:
        """Create an empty detection for failed pages.

        Args:
            page_index: Index of the failed page

        Returns:
            OCRDetection with no MRZ data and default ExtractedFields
        """
        logger.warning(
            "Creating empty detection for failed page %d", page_index + 1
        )
        return OCRDetection(
            mrz_data=None,
            extracted_fields=ExtractedFields(),
        )

    def _validate_response(self, response: OCRResponse) -> None:
        """Validate the response structure and data.

        An empty detections list only produces a warning; it is not an error.

        Args:
            response: OCRResponse to validate

        Raises:
            ValueError: If response validation fails
        """
        # Validate request_id (falsy covers both None and the empty string)
        if not response.request_id:
            raise ValueError("Request ID cannot be empty")

        # Validate media_type
        if response.media_type not in ("image", "pdf"):
            raise ValueError(f"Invalid media_type: {response.media_type}")

        # Validate processing_time
        if response.processing_time < 0:
            raise ValueError("Processing time cannot be negative")

        # Validate detections
        if not response.detections:
            logger.warning("Response has no detections")

        # Validate each detection
        for i, detection in enumerate(response.detections):
            self._validate_detection(detection, i)

        logger.debug("Response validation passed")

    def _validate_detection(self, detection: OCRDetection, index: int) -> None:
        """Validate a single detection.

        Args:
            detection: OCRDetection to validate
            index: Index of the detection

        Raises:
            ValueError: If detection validation fails
        """
        # Validate MRZ data if present
        if detection.mrz_data:
            self._validate_mrz_data(detection.mrz_data, index)

        # Validate extracted fields
        if detection.extracted_fields:
            self._validate_extracted_fields(detection.extracted_fields, index)

    def _validate_mrz_data(self, mrz_data: MRZData, index: int) -> None:
        """Validate MRZ data.

        Args:
            mrz_data: MRZ data to validate
            index: Index of the detection

        Raises:
            ValueError: If the raw text or format type is missing, or the
                confidence is outside [0.0, 1.0]
        """
        # Support both canonical and legacy attribute names
        raw_text_value = getattr(mrz_data, "raw_text", None) or getattr(
            mrz_data, "raw_mrz", None
        )
        if not raw_text_value:
            raise ValueError(f"MRZ raw text cannot be empty for detection {index}")

        format_type_value = getattr(mrz_data, "format_type", None) or getattr(
            mrz_data, "document_type", None
        )
        if not format_type_value:
            raise ValueError(f"MRZ format type cannot be empty for detection {index}")

        if not (0.0 <= mrz_data.confidence <= 1.0):
            raise ValueError(
                f"MRZ confidence must be between 0.0 and 1.0 for detection {index}"
            )

    def _validate_extracted_fields(self, fields: ExtractedFields, index: int) -> None:
        """Validate extracted fields.

        Args:
            fields: Extracted fields to validate
            index: Index of the detection

        Raises:
            ValueError: If a non-None field is not an ExtractedField or its
                confidence is outside [0.0, 1.0]
        """
        # Validate each field if present
        for field_name, field_value in fields.__dict__.items():
            if field_value is None:
                continue

            if not isinstance(field_value, ExtractedField):
                raise ValueError(
                    f"Field {field_name} must be ExtractedField instance for detection {index}"
                )

            # Validate field content
            if not (0.0 <= field_value.confidence <= 1.0):
                raise ValueError(
                    f"Field {field_name} confidence must be between 0.0 and 1.0 for detection {index}"
                )

    def build_error_response(
        self,
        request_id: str,
        error_message: str,
        processing_time: float = 0.0,
    ) -> OCRResponse:
        """Build an error response.

        The error message is only written to the log; the returned object
        carries no detections and does not include the message itself. The
        response is not run through validation.

        Args:
            request_id: Unique request identifier
            error_message: Error message
            processing_time: Processing time before error

        Returns:
            OCRResponse with media_type "image" and an empty detections list
        """
        logger.error("Building error response: %s", error_message)

        return OCRResponse(
            request_id=request_id,
            media_type="image",  # Default media type
            processing_time=processing_time,
            detections=[],  # Empty detections for error
        )
# Module-level builder shared by the helper functions below; created lazily.
_response_builder: Optional[OCRResponseBuilder] = None


def get_response_builder() -> OCRResponseBuilder:
    """Return the shared response builder, creating it on first use."""
    global _response_builder
    shared = _response_builder
    if shared is None:
        shared = OCRResponseBuilder()
        _response_builder = shared
    return shared
def build_ocr_response(
    request_id: str,
    media_type: str,
    processing_time: float,
    ocr_texts: List[str],
    page_metadata: Optional[List[Dict[str, Any]]] = None,
    debug: bool = False,
) -> OCRResponse:
    """Build a complete OCR response using the shared response builder.

    Convenience wrapper that forwards every argument unchanged to
    ``OCRResponseBuilder.build_response``.
    """
    return get_response_builder().build_response(
        request_id=request_id,
        media_type=media_type,
        processing_time=processing_time,
        ocr_texts=ocr_texts,
        page_metadata=page_metadata,
        debug=debug,
    )
def build_error_response(
    request_id: str,
    error_message: str,
    processing_time: float = 0.0,
) -> OCRResponse:
    """Build an error response using the shared response builder.

    Convenience wrapper that forwards its arguments unchanged to
    ``OCRResponseBuilder.build_error_response``.
    """
    return get_response_builder().build_error_response(
        request_id=request_id,
        error_message=error_message,
        processing_time=processing_time,
    )