File size: 13,846 Bytes
211e423
 
 
 
5537ceb
 
 
211e423
 
 
5537ceb
211e423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5537ceb
 
211e423
 
 
 
 
 
 
 
 
5537ceb
211e423
 
 
 
 
 
 
5537ceb
 
211e423
 
 
 
 
 
 
5537ceb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211e423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5537ceb
 
211e423
 
 
5537ceb
 
 
 
 
 
 
 
211e423
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
"""Response builder for Dots.OCR API responses.

This module handles the construction and validation of OCR API responses
according to the specified schema with proper error handling and metadata.

Debug-mode logging is supported to surface detailed information about
extraction results when troubleshooting in environments like Hugging Face.
"""

import logging
import os
import time
from typing import List, Optional, Dict, Any
from datetime import datetime

from .api_models import OCRResponse, OCRDetection, ExtractedFields, MRZData, ExtractedField
from .enhanced_field_extraction import EnhancedFieldExtractor

# Module-level logger, registered under this module's import name.
logger = logging.getLogger(__name__)


class OCRResponseBuilder:
    """Builds OCR API responses with proper validation and metadata.

    For every page of OCR text the builder runs the field extractor, wraps the
    result in an ``OCRDetection`` and finally validates the assembled
    ``OCRResponse``. A page whose extraction fails contributes an empty
    detection instead of aborting the whole response.
    """

    # Environment variable bounding the OCR text snippet logged in debug mode.
    _DEBUG_SNIPPET_ENV = "DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN"
    # Snippet length used when the variable is unset or not a valid length.
    _DEFAULT_DEBUG_SNIPPET_LEN = 1200

    def __init__(self):
        """Initialize the response builder."""
        self.field_extractor = EnhancedFieldExtractor()

    def build_response(
        self,
        request_id: str,
        media_type: str,
        processing_time: float,
        ocr_texts: List[str],
        page_metadata: Optional[List[Dict[str, Any]]] = None,
        debug: bool = False,
    ) -> OCRResponse:
        """Build a complete OCR response from extracted texts.

        Args:
            request_id: Unique request identifier
            media_type: Type of media processed ("image" or "pdf")
            processing_time: Total processing time in seconds
            ocr_texts: List of OCR text results (one per page)
            page_metadata: Optional metadata for each page
            debug: When True, emit detailed logs about OCR text and mapping

        Returns:
            Complete OCRResponse object with one detection per entry of
            ``ocr_texts`` (an empty detection for pages that failed).

        Raises:
            ValueError: If the assembled response fails validation.
        """
        logger.info(f"Building response for {len(ocr_texts)} pages")

        # The snippet length only matters for debug logging, so a bad value in
        # the environment must never affect a non-debug request.
        debug_snippet_len = self._resolve_debug_snippet_len() if debug else 0

        detections = []
        for i, ocr_text in enumerate(ocr_texts):
            try:
                # Extract fields and MRZ data
                extracted_fields = self.field_extractor.extract_fields(ocr_text)
                mrz_data = self.field_extractor.extract_mrz(ocr_text)

                if debug:
                    self._log_debug_details(
                        i + 1, ocr_text, extracted_fields, mrz_data, debug_snippet_len
                    )

                # Create detection for this page
                detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
                # Count only populated fields; the attribute dict also holds
                # the None placeholders of fields that were not found.
                field_count = sum(
                    1 for value in vars(extracted_fields).values() if value is not None
                )
            except Exception as e:
                # Per-page boundary: keep the traceback, then degrade this page
                # to an empty detection so the remaining pages still go out.
                logger.exception(f"Failed to process page {i + 1}: {e}")
                detection = self._create_empty_detection(i)
            else:
                logger.info(f"Page {i + 1}: {field_count} fields, MRZ: {mrz_data is not None}")

            detections.append(detection)

        # Build final response
        response = OCRResponse(
            request_id=request_id,
            media_type=media_type,
            processing_time=processing_time,
            detections=detections
        )

        # Validate response
        self._validate_response(response)

        logger.info(f"Response built successfully: {len(detections)} detections")
        return response

    def _resolve_debug_snippet_len(self) -> int:
        """Read the debug snippet length from the environment.

        Returns:
            The configured non-negative length, or the default of 1200 when
            the variable is unset, not an integer, or negative.
        """
        raw_value = os.getenv(self._DEBUG_SNIPPET_ENV)
        if raw_value is None:
            return self._DEFAULT_DEBUG_SNIPPET_LEN

        try:
            snippet_len = int(raw_value)
        except ValueError:
            snippet_len = -1

        # A negative length would slice from the end of the text, which is
        # never what a "first N characters" setting means.
        if snippet_len < 0:
            logger.warning(
                f"Ignoring invalid {self._DEBUG_SNIPPET_ENV}={raw_value!r}; "
                f"using {self._DEFAULT_DEBUG_SNIPPET_LEN}"
            )
            return self._DEFAULT_DEBUG_SNIPPET_LEN
        return snippet_len

    def _log_debug_details(
        self,
        page_number: int,
        ocr_text: str,
        extracted_fields: Any,
        mrz_data: Optional[MRZData],
        snippet_len: int,
    ) -> None:
        """Log the OCR text snippet, non-null fields and MRZ info for a page.

        Diagnostics are best-effort: any failure while formatting them is
        logged and swallowed so it cannot discard the page's real result.

        Args:
            page_number: 1-based page number used in the log lines
            ocr_text: Raw OCR text of the page
            extracted_fields: Field container returned by the extractor
            mrz_data: MRZ data if available
            snippet_len: Maximum number of OCR text characters to log
        """
        try:
            # Log a bounded snippet of the OCR text to avoid overwhelming logs
            snippet = ocr_text[:snippet_len]
            if len(ocr_text) > snippet_len:
                snippet += "\n...[truncated]"
            logger.info(
                f"[debug] Page {page_number}: OCR text snippet (len={len(ocr_text)}):\n{snippet}"
            )

            # Compact dict of non-null extracted fields. getattr keeps values
            # that are not field objects loggable instead of raising.
            non_null_fields: Dict[str, Any] = {
                fname: {
                    "value": getattr(fval, "value", fval),
                    "confidence": getattr(fval, "confidence", None),
                    "source": getattr(fval, "source", None),
                }
                for fname, fval in vars(extracted_fields).items()
                if fval is not None
            }
            logger.info(
                f"[debug] Page {page_number}: Extracted fields (non-null): {non_null_fields}"
            )

            if mrz_data is None:
                logger.info(f"[debug] Page {page_number}: No MRZ detected")
                return

            # Support both canonical and legacy attribute names
            raw_mrz = getattr(mrz_data, "raw_mrz", None) or getattr(mrz_data, "raw_text", None)
            mrz_type = getattr(mrz_data, "document_type", None) or getattr(mrz_data, "format_type", None)
            logger.info(
                f"[debug] Page {page_number}: MRZ detected — type={mrz_type}, confidence={mrz_data.confidence:.2f}"
            )
            if raw_mrz:
                logger.info(f"[debug] Page {page_number}: MRZ raw text:\n{raw_mrz}")
        except Exception:
            logger.exception(f"[debug] Page {page_number}: failed to log debug details")

    def _create_detection(
        self,
        extracted_fields: ExtractedFields,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None
    ) -> OCRDetection:
        """Create an OCR detection from extracted data.

        Args:
            extracted_fields: Extracted field data
            mrz_data: MRZ data if available
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            OCRDetection object
        """
        # Convert IdCardFields to ExtractedFields format expected by OCRDetection
        converted_fields = self._convert_fields_format(extracted_fields)

        # Enhance MRZ data if available
        enhanced_mrz = self._enhance_mrz_data(mrz_data, page_index, page_metadata)

        return OCRDetection(
            mrz_data=enhanced_mrz,
            extracted_fields=converted_fields
        )

    def _convert_fields_format(self, id_card_fields) -> ExtractedFields:
        """Convert IdCardFields to the format expected by OCRDetection.

        Only non-null attributes are forwarded; each value is turned into a
        plain dict when it offers a dict-export method, otherwise passed as-is.

        Args:
            id_card_fields: IdCardFields object

        Returns:
            ExtractedFields object
        """
        field_dict = {}

        for field_name, field_value in vars(id_card_fields).items():
            if field_value is None:
                continue

            # Convert ExtractedField to dict for Pydantic validation. Prefer
            # model_dump() where it exists (Pydantic v2 deprecates .dict());
            # fall back to .dict() so v1-style models keep working.
            if hasattr(field_value, "model_dump"):
                field_dict[field_name] = field_value.model_dump()
            elif hasattr(field_value, "dict"):
                field_dict[field_name] = field_value.dict()
            else:
                field_dict[field_name] = field_value

        return ExtractedFields(**field_dict)

    def _enhance_mrz_data(
        self,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None
    ) -> Optional[MRZData]:
        """Enhance MRZ data with additional context if available.

        Currently a pass-through: the MRZ data is returned unchanged.
        ``page_index`` and ``page_metadata`` are accepted so page-specific
        adjustments (e.g. confidence tweaks) can be added later without
        changing callers.

        Args:
            mrz_data: Original MRZ data
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            Enhanced MRZ data or None
        """
        if mrz_data is None:
            return None

        return mrz_data

    def _create_empty_detection(self, page_index: int) -> OCRDetection:
        """Create an empty detection for failed pages.

        Args:
            page_index: Index of the failed page

        Returns:
            Empty OCRDetection object
        """
        logger.warning(f"Creating empty detection for failed page {page_index + 1}")

        return OCRDetection(
            mrz_data=None,
            extracted_fields=ExtractedFields()
        )

    def _validate_response(self, response: OCRResponse) -> None:
        """Validate the response structure and data.

        An empty detection list is allowed but logged as a warning.

        Args:
            response: OCRResponse to validate

        Raises:
            ValueError: If response validation fails
        """
        # Validate request_id (covers both None and the empty string)
        if not response.request_id:
            raise ValueError("Request ID cannot be empty")

        # Validate media_type
        if response.media_type not in ("image", "pdf"):
            raise ValueError(f"Invalid media_type: {response.media_type}")

        # Validate processing_time
        if response.processing_time < 0:
            raise ValueError("Processing time cannot be negative")

        # Validate detections
        if not response.detections:
            logger.warning("Response has no detections")

        # Validate each detection
        for i, detection in enumerate(response.detections):
            self._validate_detection(detection, i)

        logger.debug("Response validation passed")

    def _validate_detection(self, detection: OCRDetection, index: int) -> None:
        """Validate a single detection.

        Args:
            detection: OCRDetection to validate
            index: Index of the detection

        Raises:
            ValueError: If detection validation fails
        """
        # Validate MRZ data if present
        if detection.mrz_data is not None:
            self._validate_mrz_data(detection.mrz_data, index)

        # Validate extracted fields
        if detection.extracted_fields is not None:
            self._validate_extracted_fields(detection.extracted_fields, index)

    def _validate_mrz_data(self, mrz_data: MRZData, index: int) -> None:
        """Validate MRZ data.

        Args:
            mrz_data: MRZ data to validate
            index: Index of the detection

        Raises:
            ValueError: If MRZ data validation fails
        """
        # Support both canonical and legacy attribute names
        raw_text_value = getattr(mrz_data, "raw_text", None) or getattr(mrz_data, "raw_mrz", None)
        if not raw_text_value:
            raise ValueError(f"MRZ raw text cannot be empty for detection {index}")

        format_type_value = getattr(mrz_data, "format_type", None) or getattr(mrz_data, "document_type", None)
        if not format_type_value:
            raise ValueError(f"MRZ format type cannot be empty for detection {index}")

        # Written as a negated range check so NaN is rejected as well.
        if not (0.0 <= mrz_data.confidence <= 1.0):
            raise ValueError(f"MRZ confidence must be between 0.0 and 1.0 for detection {index}")

    def _validate_extracted_fields(self, fields: ExtractedFields, index: int) -> None:
        """Validate extracted fields.

        Args:
            fields: Extracted fields to validate
            index: Index of the detection

        Raises:
            ValueError: If fields validation fails
        """
        # Validate each field if present
        for field_name, field_value in vars(fields).items():
            if field_value is None:
                continue

            if not isinstance(field_value, ExtractedField):
                raise ValueError(f"Field {field_name} must be ExtractedField instance for detection {index}")

            # Validate field content (negated range check also rejects NaN)
            if not (0.0 <= field_value.confidence <= 1.0):
                raise ValueError(f"Field {field_name} confidence must be between 0.0 and 1.0 for detection {index}")

    def build_error_response(
        self,
        request_id: str,
        error_message: str,
        processing_time: float = 0.0
    ) -> OCRResponse:
        """Build an error response.

        The error message is only written to the log; the returned response
        carries no error text, just an empty detection list.

        Args:
            request_id: Unique request identifier
            error_message: Error message
            processing_time: Processing time before error

        Returns:
            Error OCRResponse object
        """
        logger.error(f"Building error response: {error_message}")

        return OCRResponse(
            request_id=request_id,
            media_type="image",  # Default media type
            processing_time=processing_time,
            detections=[]  # Empty detections for error
        )


# Global response builder instance; stays None until get_response_builder()
# creates the builder on its first call.
_response_builder: Optional[OCRResponseBuilder] = None


def get_response_builder() -> OCRResponseBuilder:
    """Return the shared response builder, creating it on first use."""
    global _response_builder
    if _response_builder is not None:
        # Already constructed by an earlier call: reuse it.
        return _response_builder

    _response_builder = OCRResponseBuilder()
    return _response_builder


def build_ocr_response(
    request_id: str,
    media_type: str,
    processing_time: float,
    ocr_texts: List[str],
    page_metadata: Optional[List[Dict[str, Any]]] = None,
    debug: bool = False,
) -> OCRResponse:
    """Build a complete OCR response via the shared builder.

    Module-level convenience wrapper: every argument is forwarded unchanged
    to ``build_response`` on the builder returned by ``get_response_builder()``.
    """
    return get_response_builder().build_response(
        request_id=request_id,
        media_type=media_type,
        processing_time=processing_time,
        ocr_texts=ocr_texts,
        page_metadata=page_metadata,
        debug=debug,
    )


def build_error_response(
    request_id: str,
    error_message: str,
    processing_time: float = 0.0
) -> OCRResponse:
    """Build an error response via the shared builder.

    Module-level convenience wrapper around the builder method of the same
    name; all three arguments are forwarded unchanged.
    """
    return get_response_builder().build_error_response(
        request_id=request_id,
        error_message=error_message,
        processing_time=processing_time,
    )