Spaces:
Paused
Paused
File size: 13,846 Bytes
211e423 5537ceb 211e423 5537ceb 211e423 5537ceb 211e423 5537ceb 211e423 5537ceb 211e423 5537ceb 211e423 5537ceb 211e423 5537ceb 211e423 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
"""Response builder for Dots.OCR API responses.
This module handles the construction and validation of OCR API responses
according to the specified schema with proper error handling and metadata.
Debug-mode logging is supported to surface detailed information about
extraction results when troubleshooting in environments like Hugging Face.
"""
import logging
import os
import time
from typing import List, Optional, Dict, Any
from datetime import datetime
from .api_models import OCRResponse, OCRDetection, ExtractedFields, MRZData, ExtractedField
from .enhanced_field_extraction import EnhancedFieldExtractor
# Module-level logger named after this module. This statement only obtains the
# logger object; it does not attach handlers or set a level.
logger = logging.getLogger(__name__)
class OCRResponseBuilder:
    """Builds OCR API responses with proper validation and metadata.

    One detection is produced per OCR page. A page whose extraction fails is
    represented by an empty detection, so the response always carries exactly
    one detection per input page.
    """

    # Name of the env var that bounds the OCR text snippet written in debug logs.
    _DEBUG_SNIPPET_ENV_VAR = "DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN"
    # Fallback snippet length used when the env var is unset or unusable.
    _DEFAULT_DEBUG_SNIPPET_LEN = 1200

    def __init__(self):
        """Initialize the response builder with its field extractor."""
        self.field_extractor = EnhancedFieldExtractor()

    def build_response(
        self,
        request_id: str,
        media_type: str,
        processing_time: float,
        ocr_texts: List[str],
        page_metadata: Optional[List[Dict[str, Any]]] = None,
        debug: bool = False,
    ) -> OCRResponse:
        """Build a complete OCR response from extracted texts.

        Args:
            request_id: Unique request identifier
            media_type: Type of media processed ("image" or "pdf")
            processing_time: Total processing time in seconds
            ocr_texts: List of OCR text results (one per page)
            page_metadata: Optional metadata for each page
            debug: When True, emit detailed logs about OCR text and mapping

        Returns:
            Complete OCRResponse object

        Raises:
            ValueError: If the assembled response fails validation
        """
        logger.info(f"Building response for {len(ocr_texts)} pages")
        detections = []
        # Resolved once per call; a malformed env var falls back to the default
        # instead of failing the whole request.
        debug_snippet_len = self._debug_snippet_len()

        for i, ocr_text in enumerate(ocr_texts):
            try:
                # Extract fields and MRZ data
                extracted_fields = self.field_extractor.extract_fields(ocr_text)
                mrz_data = self.field_extractor.extract_mrz(ocr_text)

                if debug:
                    # Debug logging is best-effort: a failure while formatting
                    # diagnostics must not discard a successfully extracted page.
                    try:
                        self._log_debug_details(
                            i + 1, ocr_text, extracted_fields, mrz_data, debug_snippet_len
                        )
                    except Exception:
                        logger.exception(f"[debug] Page {i + 1}: failed to log debug details")

                # Create detection for this page
                detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
            except Exception as e:
                # logger.exception records the traceback alongside the message.
                logger.exception(f"Failed to process page {i + 1}: {e}")
                # Create empty detection for failed page
                detection = self._create_empty_detection(i)
            else:
                # Report only populated fields; the raw attribute count would be
                # the same for every page regardless of what was extracted.
                populated = sum(
                    1 for value in extracted_fields.__dict__.values() if value is not None
                )
                logger.info(f"Page {i + 1}: {populated} fields, MRZ: {mrz_data is not None}")

            # Appended exactly once per page, whichever branch produced it.
            detections.append(detection)

        # Build final response
        response = OCRResponse(
            request_id=request_id,
            media_type=media_type,
            processing_time=processing_time,
            detections=detections,
        )

        # Validate response
        self._validate_response(response)
        logger.info(f"Response built successfully: {len(detections)} detections")
        return response

    @classmethod
    def _debug_snippet_len(cls) -> int:
        """Return the debug snippet length configured via the environment.

        Returns:
            The non-negative integer from DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN, or
            the default (1200) when the variable is unset, not an integer, or
            negative.
        """
        raw = os.getenv(cls._DEBUG_SNIPPET_ENV_VAR)
        if raw is None:
            return cls._DEFAULT_DEBUG_SNIPPET_LEN
        try:
            value = int(raw)
        except ValueError:
            value = -1
        if value < 0:
            # A negative length would slice from the end of the text, which is
            # never what a "first N characters" snippet means.
            logger.warning(
                f"Ignoring invalid {cls._DEBUG_SNIPPET_ENV_VAR}={raw!r}; "
                f"using {cls._DEFAULT_DEBUG_SNIPPET_LEN}"
            )
            return cls._DEFAULT_DEBUG_SNIPPET_LEN
        return value

    def _log_debug_details(
        self,
        page_number: int,
        ocr_text: str,
        extracted_fields: Any,
        mrz_data: Any,
        snippet_len: int,
    ) -> None:
        """Log the OCR text snippet, non-null fields and MRZ info for one page.

        Args:
            page_number: 1-based page number used in the log messages
            ocr_text: Full OCR text of the page
            extracted_fields: Object returned by the field extractor
            mrz_data: MRZ object returned by the extractor, or None
            snippet_len: Maximum number of OCR text characters to log
        """
        # Log a bounded snippet of the OCR text to avoid overwhelming logs
        snippet = ocr_text[:snippet_len]
        if len(ocr_text) > snippet_len:
            snippet += "\n...[truncated]"
        logger.info(
            f"[debug] Page {page_number}: OCR text snippet (len={len(ocr_text)}):\n{snippet}"
        )

        # Prepare a compact dict of non-null extracted fields
        non_null_fields: Dict[str, Any] = {
            fname: {
                "value": fval.value,
                "confidence": fval.confidence,
                "source": fval.source,
            }
            for fname, fval in extracted_fields.__dict__.items()
            if fval is not None
        }
        logger.info(
            f"[debug] Page {page_number}: Extracted fields (non-null): {non_null_fields}"
        )

        if mrz_data is None:
            logger.info(f"[debug] Page {page_number}: No MRZ detected")
            return

        # Support both canonical and legacy attribute names
        raw_mrz = getattr(mrz_data, "raw_mrz", None) or getattr(mrz_data, "raw_text", None)
        mrz_type = getattr(mrz_data, "document_type", None) or getattr(mrz_data, "format_type", None)
        logger.info(
            f"[debug] Page {page_number}: MRZ detected — type={mrz_type}, confidence={mrz_data.confidence:.2f}"
        )
        if raw_mrz:
            logger.info(f"[debug] Page {page_number}: MRZ raw text:\n{raw_mrz}")

    def _create_detection(
        self,
        extracted_fields: ExtractedFields,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None,
    ) -> OCRDetection:
        """Create an OCR detection from extracted data.

        Args:
            extracted_fields: Extracted field data
            mrz_data: MRZ data if available
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            OCRDetection object
        """
        # Re-validate the extractor output as the ExtractedFields model used by OCRDetection
        converted_fields = self._convert_fields_format(extracted_fields)
        # Enhance MRZ data if available
        enhanced_mrz = self._enhance_mrz_data(mrz_data, page_index, page_metadata)
        return OCRDetection(
            mrz_data=enhanced_mrz,
            extracted_fields=converted_fields,
        )

    @staticmethod
    def _field_to_payload(field_value: Any) -> Any:
        """Return a plain representation of a field for model construction.

        Prefers ``model_dump()`` (Pydantic v2), falls back to ``dict()``
        (Pydantic v1 and the deprecated v2 alias), and otherwise returns the
        value unchanged.
        """
        for method_name in ("model_dump", "dict"):
            method = getattr(field_value, method_name, None)
            if callable(method):
                return method()
        return field_value

    def _convert_fields_format(self, id_card_fields) -> ExtractedFields:
        """Convert extractor output to the format expected by OCRDetection.

        Args:
            id_card_fields: Object whose instance attributes hold the
                extracted fields (None for fields that were not found)

        Returns:
            ExtractedFields object built from the non-None attributes
        """
        # Fields are converted to plain dicts so the ExtractedFields model can
        # validate them itself.
        field_dict = {
            field_name: self._field_to_payload(field_value)
            for field_name, field_value in id_card_fields.__dict__.items()
            if field_value is not None
        }
        return ExtractedFields(**field_dict)

    def _enhance_mrz_data(
        self,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None,
    ) -> Optional[MRZData]:
        """Enhance MRZ data with additional context if available.

        Currently a pass-through: ``page_index`` and ``page_metadata`` are
        accepted so page-specific adjustments can be added later without
        changing callers, but they are not used yet.

        Args:
            mrz_data: Original MRZ data
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            The MRZ data unchanged, or None when no MRZ data was given
        """
        if mrz_data is None:
            return None
        return mrz_data

    def _create_empty_detection(self, page_index: int) -> OCRDetection:
        """Create an empty detection for failed pages.

        Args:
            page_index: Index of the failed page

        Returns:
            Empty OCRDetection object
        """
        logger.warning(f"Creating empty detection for failed page {page_index + 1}")
        return OCRDetection(
            mrz_data=None,
            extracted_fields=ExtractedFields(),
        )

    def _validate_response(self, response: OCRResponse) -> None:
        """Validate the response structure and data.

        Args:
            response: OCRResponse to validate

        Raises:
            ValueError: If response validation fails
        """
        # Validate request_id
        if not response.request_id:
            raise ValueError("Request ID cannot be empty")
        # Validate media_type
        if response.media_type not in ("image", "pdf"):
            raise ValueError(f"Invalid media_type: {response.media_type}")
        # Validate processing_time
        if response.processing_time < 0:
            raise ValueError("Processing time cannot be negative")
        # An empty detection list is allowed but worth flagging.
        if not response.detections:
            logger.warning("Response has no detections")
        # Validate each detection
        for i, detection in enumerate(response.detections):
            self._validate_detection(detection, i)
        logger.debug("Response validation passed")

    def _validate_detection(self, detection: OCRDetection, index: int) -> None:
        """Validate a single detection.

        Args:
            detection: OCRDetection to validate
            index: Index of the detection

        Raises:
            ValueError: If detection validation fails
        """
        # Validate MRZ data if present
        if detection.mrz_data:
            self._validate_mrz_data(detection.mrz_data, index)
        # Validate extracted fields
        if detection.extracted_fields:
            self._validate_extracted_fields(detection.extracted_fields, index)

    def _validate_mrz_data(self, mrz_data: MRZData, index: int) -> None:
        """Validate MRZ data.

        Args:
            mrz_data: MRZ data to validate
            index: Index of the detection

        Raises:
            ValueError: If MRZ data validation fails
        """
        # Support both canonical and legacy attribute names
        raw_text_value = getattr(mrz_data, "raw_text", None) or getattr(mrz_data, "raw_mrz", None)
        if not raw_text_value:
            raise ValueError(f"MRZ raw text cannot be empty for detection {index}")
        format_type_value = getattr(mrz_data, "format_type", None) or getattr(mrz_data, "document_type", None)
        if not format_type_value:
            raise ValueError(f"MRZ format type cannot be empty for detection {index}")
        # A missing confidence is reported as ValueError like any other
        # out-of-range value, rather than surfacing as a TypeError.
        confidence = mrz_data.confidence
        if confidence is None or not (0.0 <= confidence <= 1.0):
            raise ValueError(f"MRZ confidence must be between 0.0 and 1.0 for detection {index}")

    def _validate_extracted_fields(self, fields: ExtractedFields, index: int) -> None:
        """Validate extracted fields.

        Args:
            fields: Extracted fields to validate
            index: Index of the detection

        Raises:
            ValueError: If fields validation fails
        """
        # Validate each field if present
        for field_name, field_value in fields.__dict__.items():
            if field_value is None:
                continue
            if not isinstance(field_value, ExtractedField):
                raise ValueError(f"Field {field_name} must be ExtractedField instance for detection {index}")
            # Validate field content; None is rejected with the same error as
            # an out-of-range value.
            confidence = field_value.confidence
            if confidence is None or not (0.0 <= confidence <= 1.0):
                raise ValueError(f"Field {field_name} confidence must be between 0.0 and 1.0 for detection {index}")

    def build_error_response(
        self,
        request_id: str,
        error_message: str,
        processing_time: float = 0.0,
    ) -> OCRResponse:
        """Build an error response.

        The error message is written to the log only; the returned response
        carries no error text, just an empty detection list.

        Args:
            request_id: Unique request identifier
            error_message: Error message
            processing_time: Processing time before error

        Returns:
            Error OCRResponse object
        """
        logger.error(f"Building error response: {error_message}")
        return OCRResponse(
            request_id=request_id,
            media_type="image",  # Default media type; the real one is not passed in
            processing_time=processing_time,
            detections=[],  # Empty detections for error
        )
# Process-wide OCRResponseBuilder, created lazily on first request for it.
_response_builder: Optional[OCRResponseBuilder] = None


def get_response_builder() -> OCRResponseBuilder:
    """Return the shared OCRResponseBuilder, constructing it on first use."""
    global _response_builder
    builder = _response_builder
    if builder is None:
        # First call: create the instance and remember it for later calls.
        builder = OCRResponseBuilder()
        _response_builder = builder
    return builder
def build_ocr_response(
    request_id: str,
    media_type: str,
    processing_time: float,
    ocr_texts: List[str],
    page_metadata: Optional[List[Dict[str, Any]]] = None,
    debug: bool = False,
) -> OCRResponse:
    """Build a complete OCR response using the shared response builder.

    Convenience wrapper that forwards every argument unchanged to
    ``OCRResponseBuilder.build_response`` on the instance returned by
    ``get_response_builder()``.

    Args:
        request_id: Unique request identifier
        media_type: Type of media processed ("image" or "pdf")
        processing_time: Total processing time in seconds
        ocr_texts: List of OCR text results (one per page)
        page_metadata: Optional metadata for each page
        debug: When True, emit detailed logs about OCR text and mapping

    Returns:
        Complete OCRResponse object
    """
    return get_response_builder().build_response(
        request_id=request_id,
        media_type=media_type,
        processing_time=processing_time,
        ocr_texts=ocr_texts,
        page_metadata=page_metadata,
        debug=debug,
    )
def build_error_response(
    request_id: str,
    error_message: str,
    processing_time: float = 0.0
) -> OCRResponse:
    """Build an error response using the shared response builder.

    Args:
        request_id: Unique request identifier
        error_message: Error message
        processing_time: Processing time before error

    Returns:
        The OCRResponse produced by ``OCRResponseBuilder.build_error_response``
    """
    shared_builder = get_response_builder()
    error_response = shared_builder.build_error_response(
        request_id, error_message, processing_time
    )
    return error_response
|