style: format Python files with Black
- src/kybtech_dots_ocr/__init__.py +8 -2
- src/kybtech_dots_ocr/api_models.py +40 -11
- src/kybtech_dots_ocr/app.py +78 -40
- src/kybtech_dots_ocr/enhanced_field_extraction.py +118 -80
- src/kybtech_dots_ocr/field_extraction.py +28 -32
- src/kybtech_dots_ocr/models.py +33 -10
- src/kybtech_dots_ocr/preprocessing.py +72 -67
- src/kybtech_dots_ocr/response_builder.py +53 -3
src/kybtech_dots_ocr/__init__.py
CHANGED
@@ -8,7 +8,13 @@ __author__ = "Algoryn"
 __email__ = "info@algoryn.com"
 
 from .app import app
-from .api_models import
+from .api_models import (
+    OCRResponse,
+    OCRDetection,
+    ExtractedFields,
+    MRZData,
+    ExtractedField,
+)
 from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
 from .preprocessing import process_document, validate_file_size, get_document_info
 from .response_builder import build_ocr_response, build_error_response
@@ -16,7 +22,7 @@ from .response_builder import build_ocr_response, build_error_response
 __all__ = [
     "app",
     "OCRResponse",
-    "OCRDetection",
+    "OCRDetection",
     "ExtractedFields",
     "MRZData",
     "ExtractedField",
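For orientation, a minimal sketch of how the re-exported symbols above are typically consumed (illustrative only and not part of the commit; only the package path and the names listed in the diff come from the source, the constructor arguments mirror how the extractors build the model):

# Illustrative sketch, not part of the commit.
from kybtech_dots_ocr import OCRResponse, ExtractedField

field = ExtractedField(
    field_name="surname", value="JANSEN", confidence=0.92, source="ocr"
)
print(field.field_name, field.confidence)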
src/kybtech_dots_ocr/api_models.py
CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
 
 class BoundingBox(BaseModel):
     """Normalized bounding box coordinates."""
+
     x1: float = Field(..., ge=0.0, le=1.0, description="Top-left x coordinate")
     y1: float = Field(..., ge=0.0, le=1.0, description="Top-left y coordinate")
     x2: float = Field(..., ge=0.0, le=1.0, description="Bottom-right x coordinate")
@@ -18,6 +19,7 @@ class BoundingBox(BaseModel):
 
 class ExtractedField(BaseModel):
     """Individual extracted field with confidence and source."""
+
     field_name: str = Field(..., description="Standardized field name")
     value: Optional[str] = Field(None, description="Extracted field value")
     confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
@@ -26,10 +28,19 @@ class ExtractedField(BaseModel):
 
 class IdCardFields(BaseModel):
     """Structured fields extracted from identity documents."""
-
-
-
-
+
+    document_number: Optional[ExtractedField] = Field(
+        None, description="Document number/ID"
+    )
+    document_type: Optional[ExtractedField] = Field(
+        None, description="Type of document"
+    )
+    issuing_country: Optional[ExtractedField] = Field(
+        None, description="Issuing country code"
+    )
+    issuing_authority: Optional[ExtractedField] = Field(
+        None, description="Issuing authority"
+    )
 
     # Personal Information
     surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
@@ -42,15 +53,22 @@ class IdCardFields(BaseModel):
     # Validity Information
     date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
     date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
-    personal_number: Optional[ExtractedField] = Field(
+    personal_number: Optional[ExtractedField] = Field(
+        None, description="Personal number"
+    )
 
     # Additional fields for specific document types
-    optional_data_1: Optional[ExtractedField] = Field(
-
+    optional_data_1: Optional[ExtractedField] = Field(
+        None, description="Optional data field 1"
+    )
+    optional_data_2: Optional[ExtractedField] = Field(
+        None, description="Optional data field 2"
+    )
 
 
 class ExtractedFields(BaseModel):
     """All extracted fields from identity document."""
+
     document_number: Optional[ExtractedField] = None
     document_type: Optional[ExtractedField] = None
     issuing_country: Optional[ExtractedField] = None
@@ -70,8 +88,11 @@
 
 class MRZData(BaseModel):
     """Machine Readable Zone data."""
+
     # Primary canonical fields
-    document_type: Optional[str] = Field(
+    document_type: Optional[str] = Field(
+        None, description="MRZ document type (TD1|TD2|TD3)"
+    )
     issuing_country: Optional[str] = Field(None, description="Issuing country code")
     surname: Optional[str] = Field(None, description="Surname from MRZ")
     given_names: Optional[str] = Field(None, description="Given names from MRZ")
@@ -82,22 +103,30 @@
     date_of_expiry: Optional[str] = Field(None, description="Date of expiry from MRZ")
     personal_number: Optional[str] = Field(None, description="Personal number from MRZ")
     raw_mrz: Optional[str] = Field(None, description="Raw MRZ text")
-    confidence: float = Field(
+    confidence: float = Field(
+        0.0, ge=0.0, le=1.0, description="MRZ extraction confidence"
+    )
 
     # Backwards compatibility fields (some older code/tests expect these names)
     # These duplicate information from the canonical fields above.
-    format_type: Optional[str] = Field(
-
+    format_type: Optional[str] = Field(
+        None, description="Alias of document_type for backward compatibility"
+    )
+    raw_text: Optional[str] = Field(
+        None, description="Alias of raw_mrz for backward compatibility"
+    )
 
 
 class OCRDetection(BaseModel):
     """Single OCR detection result."""
+
     mrz_data: Optional[MRZData] = Field(None, description="MRZ data if detected")
     extracted_fields: ExtractedFields = Field(..., description="Extracted field data")
 
 
 class OCRResponse(BaseModel):
     """OCR API response."""
+
     request_id: str = Field(..., description="Unique request identifier")
     media_type: str = Field(..., description="Media type processed")
     processing_time: float = Field(..., description="Processing time in seconds")
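As a quick illustration of the reformatted models (a sketch, not part of the commit; the sample values are invented), MRZData can carry both the canonical fields and the legacy aliases, and Pydantic enforces the 0.0-1.0 confidence bounds declared above:

# Illustrative sketch based on the MRZData model shown in this diff.
from kybtech_dots_ocr.api_models import MRZData

mrz = MRZData(
    document_type="TD3",
    issuing_country="NLD",
    surname="JANSEN",
    raw_mrz="P<NLDJANSEN<<...",
    format_type="TD3",  # legacy alias of document_type
    raw_text="P<NLDJANSEN<<...",  # legacy alias of raw_mrz
    confidence=0.93,
)
# A confidence outside [0.0, 1.0] raises a pydantic ValidationError.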
src/kybtech_dots_ocr/app.py
CHANGED
@@ -17,7 +17,14 @@ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.responses import JSONResponse
 
 # Import local modules
-from .api_models import
+from .api_models import (
+    BoundingBox,
+    ExtractedField,
+    ExtractedFields,
+    MRZData,
+    OCRDetection,
+    OCRResponse,
+)
 from .enhanced_field_extraction import EnhancedFieldExtractor
 from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
 from .preprocessing import process_document, validate_file_size, get_document_info
@@ -27,6 +34,13 @@ from .response_builder import build_ocr_response, build_error_response
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Enable verbose logging globally if DOTS_OCR_DEBUG env var is set.
+_env_debug = os.getenv("DOTS_OCR_DEBUG", "0").lower() in {"1", "true", "yes"}
+if _env_debug:
+    # Elevate root logger to DEBUG to include lower-level events from submodules
+    logging.getLogger().setLevel(logging.DEBUG)
+    logger.info("DOTS_OCR_DEBUG enabled via environment — verbose logging active")
+
 # Global model state
 model_loaded = False
 
@@ -34,13 +48,11 @@ model_loaded = False
 # FieldExtractor is now imported from the shared module
 
 
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan manager for model loading."""
     global model_loaded
-
+
     # Allow tests and lightweight environments to skip model loading
     # Set DOTS_OCR_SKIP_MODEL_LOAD=1 to bypass heavy downloads during tests/CI
     skip_model_load = os.getenv("DOTS_OCR_SKIP_MODEL_LOAD", "0") == "1"
@@ -50,25 +62,27 @@ async def lifespan(app: FastAPI):
         if skip_model_load:
             # Explicitly skip model loading for fast startup in tests/CI
             model_loaded = False
-            logger.warning(
+            logger.warning(
+                "DOTS_OCR_SKIP_MODEL_LOAD=1 set - skipping model load (mock mode)"
+            )
         else:
             # Load the model using the new model loader
             load_model()
             model_loaded = True
             logger.info("Dots.OCR model loaded successfully")
-
+
             # Log model information
            model_info = get_model_info()
            logger.info(f"Model info: {model_info}")
-
+
     except Exception as e:
         logger.error(f"Failed to load Dots.OCR model: {e}")
         # Don't raise - allow mock mode for development
         model_loaded = False
         logger.warning("Model loading failed - using mock implementation")
-
+
     yield
-
+
     logger.info("Shutting down Dots.OCR endpoint...")
 
 
@@ -76,61 +90,79 @@ app = FastAPI(
     title="KYB Dots.OCR Text Extraction",
     description="Dots.OCR for identity document text extraction with ROI support",
     version="1.0.0",
-    lifespan=lifespan
+    lifespan=lifespan,
 )
 
 
 @app.get("/")
 async def root():
     """Root route for uptime checks."""
-    return {"status": "ok"
+    return {"status": "ok"}
 
 
 @app.get("/health")
 async def health_check():
     """Health check endpoint."""
     global model_loaded
-
+
     status = "healthy" if model_loaded else "degraded"
     model_info = get_model_info() if model_loaded else None
-
+
     return {
-        "status": status,
+        "status": status,
         "version": "1.0.0",
         "model_loaded": model_loaded,
-        "model_info": model_info
+        "model_info": model_info,
     }
 
 
 @app.post("/v1/id/ocr", response_model=OCRResponse)
 async def extract_text_endpoint(
     file: UploadFile = File(..., description="Image or PDF file to process"),
-    roi: Optional[str] = Form(None, description="ROI coordinates as JSON string")
+    roi: Optional[str] = Form(None, description="ROI coordinates as JSON string"),
+    debug: Optional[bool] = Form(
+        None,
+        description=(
+            "Enable verbose debug logging for this request. Overrides env when True."
+        ),
+    ),
 ):
     """Extract text from identity document image or PDF."""
     global model_loaded
-
+
     # Allow mock mode when model isn't loaded to support tests/CI and dev flows
     allow_mock = os.getenv("DOTS_OCR_ALLOW_MOCK", "1") == "1"
     is_mock_mode = (not model_loaded) and allow_mock
     if not model_loaded and not allow_mock:
         raise HTTPException(status_code=503, detail="Model not loaded")
-
+
+    # Determine effective debug mode for this request
+    env_debug = os.getenv("DOTS_OCR_DEBUG", "0").lower() in {"1", "true", "yes"}
+    debug_enabled = bool(debug) if debug is not None else env_debug
+    if debug_enabled:
+        logger.info(
+            f"[debug] Request {request_id}: debug logging enabled (env={env_debug}, form={debug})"
+        )
+    if is_mock_mode:
+        logger.warning(
+            "Using mock mode — OCR text will be empty. To enable real inference, ensure the model loads successfully (unset DOTS_OCR_SKIP_MODEL_LOAD and provide resources)."
+        )
+
     start_time = time.time()
     request_id = str(uuid.uuid4())
-
+
     try:
         # Read file data
         file_data = await file.read()
-
+
         # Validate file size
         if not validate_file_size(file_data):
             raise HTTPException(status_code=413, detail="File size exceeds limit")
-
+
         # Get document information
         doc_info = get_document_info(file_data)
         logger.info(f"Processing document: {doc_info}")
-
+
         # Parse ROI if provided
         roi_coords = None
         if roi:
@@ -142,19 +174,21 @@ async def extract_text_endpoint(
             except Exception as e:
                 logger.warning(f"Invalid ROI provided: {e}")
                 raise HTTPException(status_code=400, detail=f"Invalid ROI format: {e}")
-
+
         # Process document (PDF to images or single image)
         try:
             processed_images = process_document(file_data, roi_coords)
             logger.info(f"Processed {len(processed_images)} images from document")
         except Exception as e:
             logger.error(f"Document processing failed: {e}")
-            raise HTTPException(
-
+            raise HTTPException(
+                status_code=400, detail=f"Document processing failed: {e}"
+            )
+
         # Process each image and extract text
         ocr_texts = []
         page_metadata = []
-
+
         for i, image in enumerate(processed_images):
             try:
                 # Extract text using the loaded model, or produce mock output in mock mode
@@ -163,47 +197,50 @@ async def extract_text_endpoint(
                     ocr_text = ""
                 else:
                     ocr_text = extract_text(image)
-                logger.info(
-
+                logger.info(
+                    f"Page {i + 1} - Extracted text length: {len(ocr_text)} characters"
+                )
+
                 ocr_texts.append(ocr_text)
-
+
                 # Collect page metadata
                 page_meta = {
                     "page_index": i,
                     "image_size": image.size,
                     "text_length": len(ocr_text),
-                    "processing_successful": True
+                    "processing_successful": True,
                 }
                 page_metadata.append(page_meta)
-
+
             except Exception as e:
                 logger.error(f"Text extraction failed for page {i + 1}: {e}")
                 # Add empty text for failed page
                 ocr_texts.append("")
-
+
                 page_meta = {
                     "page_index": i,
-                    "image_size": image.size if hasattr(image,
+                    "image_size": image.size if hasattr(image, "size") else (0, 0),
                     "text_length": 0,
                     "processing_successful": False,
-                    "error": str(e)
+                    "error": str(e),
                 }
                 page_metadata.append(page_meta)
-
+
         # Determine media type for response
         media_type = "pdf" if doc_info["is_pdf"] else "image"
-
+
         processing_time = time.time() - start_time
-
+
         # Build response using the response builder
         return build_ocr_response(
             request_id=request_id,
             media_type=media_type,
             processing_time=processing_time,
             ocr_texts=ocr_texts,
-            page_metadata=page_metadata
+            page_metadata=page_metadata,
+            debug=debug_enabled,
         )
-
+
     except HTTPException:
         # Re-raise HTTP exceptions as-is
         raise
@@ -213,11 +250,12 @@ async def extract_text_endpoint(
         error_response = build_error_response(
             request_id=request_id,
             error_message=f"OCR extraction failed: {str(e)}",
-            processing_time=processing_time
+            processing_time=processing_time,
         )
         raise HTTPException(status_code=500, detail=error_response.dict())
 
 
 if __name__ == "__main__":
     import uvicorn
+
     uvicorn.run(app, host="0.0.0.0", port=7860)
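A minimal client sketch for the endpoint above (the host, port, file name, and ROI payload shape are assumptions for illustration; only the route, form field names, and response fields come from the diff):

# Illustrative client sketch; values are placeholders, not part of the commit.
import json
import requests

with open("id_front.jpg", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/v1/id/ocr",
        files={"file": ("id_front.jpg", f, "image/jpeg")},
        data={
            "roi": json.dumps({"x1": 0.1, "y1": 0.1, "x2": 0.9, "y2": 0.9}),
            "debug": "true",  # per-request debug form field added in this commit
        },
    )
print(resp.status_code, resp.json().get("request_id"))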
src/kybtech_dots_ocr/enhanced_field_extraction.py
CHANGED
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
 
 class EnhancedFieldExtractor:
     """Enhanced field extraction with improved confidence scoring and validation."""
-
+
     # Enhanced field mapping patterns with confidence scoring
     FIELD_PATTERNS = {
         "document_number": [
@@ -35,7 +35,10 @@ class EnhancedFieldExtractor:
         ],
         "given_names": [
             (r"^\s*voornamen[:\s]*([^\r\n]+)", 0.95),  # Dutch format (line-anchored)
-            (
+            (
+                r"^\s*given\s*names[:\s]*([^\r\n]+)",
+                0.9,
+            ),  # English format (line-anchored)
             (r"^\s*first\s*name[:\s]*([^\r\n]+)", 0.85),  # First name only
             (r"^\s*voorletters[:\s]*([^\r\n]+)", 0.75),  # Dutch initials
         ],
@@ -46,7 +49,10 @@
         ],
         "date_of_birth": [
             (r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9),  # Dutch format
-            (
+            (
+                r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
+                0.85,
+            ),  # English format
             (r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Short English
             (r"(\d{2}[./-]\d{2}[./-]\d{4})", 0.6),  # Generic date pattern
         ],
@@ -64,14 +70,23 @@
         ],
         "date_of_issue": [
             (r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9),  # Dutch format
-            (
+            (
+                r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
+                0.85,
+            ),  # English format
             (r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Short English
         ],
         "date_of_expiry": [
             (r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9),  # Dutch format
-            (
+            (
+                r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
+                0.85,
+            ),  # English format
             (r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Short English
-            (
+            (
+                r"valid\s*until[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
+                0.8,
+            ),  # Alternative English
         ],
         "personal_number": [
             (r"persoonsnummer[:\s]*(\d{9})", 0.9),  # Dutch format
@@ -95,39 +110,48 @@
             (r"issuing\s*authority[:\s]*([A-Za-z\s]{3,30})", 0.8),  # English format
             (r"uitgevende\s*autoriteit[:\s]*([A-Za-z\s]{3,30})", 0.9),  # Dutch format
             (r"authority[:\s]*([A-Za-z\s]{3,30})", 0.7),  # Short format
-        ]
+        ],
     }
-
+
     # MRZ patterns with confidence scoring
     MRZ_PATTERNS = [
         # Strict formats first, allowing leading/trailing whitespace per line
-        (
-
-
+        (
+            r"^\s*((?:[A-Z0-9<]{44})\s*\n\s*(?:[A-Z0-9<]{44}))\s*$",
+            0.95,
+        ),  # TD3: Passport (2 x 44)
+        (
+            r"^\s*((?:[A-Z0-9<]{36})\s*\n\s*(?:[A-Z0-9<]{36}))\s*$",
+            0.9,
+        ),  # TD2: ID card (2 x 36)
+        (
+            r"^\s*((?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30}))\s*$",
+            0.85,
+        ),  # TD1: (3 x 30)
         # Fallback generic: a line starting with P< followed by another MRZ-like line
         (r"(P<[^\r\n]+\n[^\r\n]+)", 0.85),
     ]
-
+
     @classmethod
     def extract_fields(cls, ocr_text: str) -> IdCardFields:
         """Extract structured fields from OCR text with enhanced confidence scoring.
-
+
         Args:
             ocr_text: Raw OCR text from document processing
-
+
         Returns:
             IdCardFields object with extracted field data
         """
         logger.info(f"Extracting fields from text of length: {len(ocr_text)}")
-
+
         fields = {}
         extraction_stats = {"total_patterns": 0, "matches_found": 0}
-
+
         for field_name, patterns in cls.FIELD_PATTERNS.items():
             value = None
             confidence = 0.0
             best_pattern = None
-
+
             for pattern, base_confidence in patterns:
                 extraction_stats["total_patterns"] += 1
                 match = re.search(pattern, ocr_text, re.IGNORECASE | re.MULTILINE)
@@ -139,37 +163,43 @@
                     confidence = base_confidence
                     best_pattern = pattern
                     extraction_stats["matches_found"] += 1
-                    logger.debug(
+                    logger.debug(
+                        f"Found {field_name}: '{value}' (confidence: {confidence:.2f})"
+                    )
                     break
-
+
             if value:
                 # Apply additional confidence adjustments
-                confidence = cls._adjust_confidence(
-
+                confidence = cls._adjust_confidence(
+                    field_name, value, confidence, ocr_text
+                )
+
                 fields[field_name] = ExtractedField(
                     field_name=field_name,
                     value=value,
                     confidence=confidence,
-                    source="ocr"
+                    source="ocr",
                 )
-
-        logger.info(
+
+        logger.info(
+            f"Field extraction complete: {extraction_stats['matches_found']}/{extraction_stats['total_patterns']} patterns matched"
+        )
         return IdCardFields(**fields)
-
+
     @classmethod
     def _validate_field_value(cls, field_name: str, value: str) -> bool:
         """Validate extracted field value based on field type.
-
+
         Args:
             field_name: Name of the field
             value: Extracted value to validate
-
+
         Returns:
             True if value is valid
         """
         if not value or len(value.strip()) == 0:
             return False
-
+
         # Field-specific validation
         if field_name == "document_number":
             return len(value) >= 6 and len(value) <= 15
@@ -185,16 +215,16 @@
             return len(value) == 9 and value.isdigit()
         elif field_name == "issuing_country":
             return len(value) == 3 and value.isalpha()
-
+
         return True
-
+
     @classmethod
     def _validate_date_format(cls, date_str: str) -> bool:
         """Validate date format and basic date logic.
-
+
         Args:
             date_str: Date string to validate
-
+
         Returns:
             True if date format is valid
         """
@@ -206,59 +236,63 @@
             if len(parts) == 3:
                 day, month, year = parts
                 # Basic validation
-                if (
-                    1 <= int(
-
+                if (
+                    1 <= int(day) <= 31
+                    and 1 <= int(month) <= 12
+                    and 1900 <= int(year) <= 2100
+                ):
                     return True
         except (ValueError, IndexError):
             pass
         return False
-
+
     @classmethod
-    def _adjust_confidence(
+    def _adjust_confidence(
+        cls, field_name: str, value: str, base_confidence: float, full_text: str
+    ) -> float:
         """Adjust confidence based on additional factors.
-
+
         Args:
             field_name: Name of the field
             value: Extracted value
             base_confidence: Base confidence from pattern matching
             full_text: Full OCR text for context
-
+
         Returns:
             Adjusted confidence score
         """
         confidence = base_confidence
-
+
         # Length-based adjustments
         if field_name in ["surname", "given_names"] and len(value) < 3:
             confidence *= 0.8  # Shorter names are less reliable
-
+
         # Context-based adjustments
         if field_name == "document_number" and "passport" in full_text.lower():
             confidence *= 1.1  # Higher confidence in passport context
-
+
         # Multiple occurrence bonus
         if value in full_text and full_text.count(value) > 1:
             confidence *= 1.05  # Slight bonus for repeated values
-
+
         # Ensure confidence stays within bounds
         return min(max(confidence, 0.0), 1.0)
-
+
     @classmethod
     def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
         """Extract MRZ data from OCR text with enhanced validation.
-
+
         Args:
             ocr_text: Raw OCR text from document processing
-
+
         Returns:
             MRZData object if MRZ detected, None otherwise
         """
         logger.info("Extracting MRZ data from OCR text")
-
+
         best_match = None
         best_confidence = 0.0
-
+
         for pattern, base_confidence in cls.MRZ_PATTERNS:
             match = re.search(pattern, ocr_text, re.MULTILINE)
             if match:
@@ -268,23 +302,24 @@
                 confidence = base_confidence
                 # Adjust confidence based on MRZ quality
                 confidence = cls._adjust_mrz_confidence(raw_mrz, confidence)
-
+
                 if confidence > best_confidence:
                     best_match = raw_mrz
                     best_confidence = confidence
                     logger.debug(f"Found MRZ with confidence {confidence:.2f}")
-
+
         if best_match:
             # Parse MRZ to determine format type
             format_type = cls._determine_mrz_format(best_match)
-
+
             # Basic checksum validation
             is_valid, errors = cls._validate_mrz_checksums(best_match, format_type)
-
+
             logger.info(f"MRZ extracted: {format_type} format, valid: {is_valid}")
-
+
             # Convert to the format expected by the API
             from .api_models import MRZData as APIMRZData
+
             # Populate both canonical and legacy alias fields for compatibility
             return APIMRZData(
                 document_type=format_type,
@@ -302,47 +337,47 @@
                 raw_text=best_match,  # legacy alias
                 confidence=best_confidence,
             )
-
+
         logger.info("No MRZ data found in OCR text")
         return None
-
+
     @classmethod
     def _validate_mrz_format(cls, mrz_text: str) -> bool:
         """Validate basic MRZ format.
-
+
         Args:
             mrz_text: Raw MRZ text
-
+
         Returns:
             True if format is valid
         """
-        lines = mrz_text.strip().split(
+        lines = mrz_text.strip().split("\n")
         if len(lines) < 2:
             return False
-
+
         # Normalize whitespace and validate character set only.
         normalized_lines = [re.sub(r"\s+", "", line) for line in lines]
         for line in normalized_lines:
-            if not re.match(r
+            if not re.match(r"^[A-Z0-9<]+$", line):
                 return False
-
+
         return True
-
+
     @classmethod
     def _determine_mrz_format(cls, mrz_text: str) -> str:
         """Determine MRZ format type.
-
+
         Args:
             mrz_text: Raw MRZ text
-
+
         Returns:
             Format type (TD1, TD2, TD3, etc.)
         """
-        lines = mrz_text.strip().split(
+        lines = mrz_text.strip().split("\n")
         lines = [re.sub(r"\s+", "", line) for line in lines]
         line_count = len(lines)
         line_length = len(lines[0]) if lines else 0
-
+
         # Heuristic mapping: prioritize semantics over exact lengths for robustness
         if line_count == 2 and lines[0].startswith("P<"):
             return "TD3"  # Passport format commonly starts with P<
@@ -351,53 +386,56 @@
         if line_count == 3:
             return "TD1"
         return "UNKNOWN"
-
+
     @classmethod
     def _adjust_mrz_confidence(cls, mrz_text: str, base_confidence: float) -> float:
         """Adjust MRZ confidence based on quality indicators.
-
+
         Args:
             mrz_text: Raw MRZ text
             base_confidence: Base confidence from pattern matching
-
+
         Returns:
             Adjusted confidence
         """
         confidence = base_confidence
-
+
         # Check line consistency
-        lines = mrz_text.strip().split(
+        lines = mrz_text.strip().split("\n")
         if len(set(len(line) for line in lines)) == 1:
             confidence *= 1.05  # Bonus for consistent line lengths
-
+
         return min(max(confidence, 0.0), 1.0)
-
+
     @classmethod
-    def _validate_mrz_checksums(
+    def _validate_mrz_checksums(
+        cls, mrz_text: str, format_type: str
+    ) -> Tuple[bool, List[str]]:
         """Validate MRZ checksums (simplified implementation).
-
+
         Args:
             mrz_text: Raw MRZ text
             format_type: MRZ format type
-
+
         Returns:
             Tuple of (is_valid, list_of_errors)
         """
         # This is a simplified implementation
         # In production, you would implement full MRZ checksum validation
         errors = []
-
+
         # Basic validation - check for reasonable character distribution
-        if mrz_text.count(
+        if mrz_text.count("<") > len(mrz_text) * 0.3:
             errors.append("Too many fill characters")
-
+
         # For now, assume valid if basic format is correct
         is_valid = len(errors) == 0
-
+
         return is_valid, errors
 
 
 # Backward compatibility - use enhanced extractor as default
 class FieldExtractor(EnhancedFieldExtractor):
     """Backward compatible field extractor using enhanced implementation."""
+
     pass
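A short usage sketch for the enhanced extractor (the sample text is invented; which fields match depends on FIELD_PATTERNS, parts of which lie outside the hunks shown above):

# Illustrative sketch, not part of the commit.
from kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor

sample = "Documentnummer: ABC123456\nGeboortedatum: 01-02-1990"
fields = EnhancedFieldExtractor.extract_fields(sample)
if fields.date_of_birth:
    # Matched by the Dutch "geboortedatum" pattern with base confidence 0.9
    print(fields.date_of_birth.value, fields.date_of_birth.confidence)

mrz = EnhancedFieldExtractor.extract_mrz(sample)  # None here: no MRZ-like lines present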
src/kybtech_dots_ocr/field_extraction.py
CHANGED
|
@@ -11,100 +11,96 @@ from .api_models import ExtractedField, IdCardFields, MRZData
|
|
| 11 |
|
| 12 |
class FieldExtractor:
|
| 13 |
"""Field extraction and mapping from OCR results."""
|
| 14 |
-
|
| 15 |
# Field mapping patterns for Dutch ID cards
|
| 16 |
FIELD_PATTERNS = {
|
| 17 |
"document_number": [
|
| 18 |
r"documentnummer[:\s]*([A-Z0-9]+)",
|
| 19 |
r"document\s*number[:\s]*([A-Z0-9]+)",
|
| 20 |
-
r"nr[:\s]*([A-Z0-9]+)"
|
| 21 |
],
|
| 22 |
"surname": [
|
| 23 |
r"achternaam[:\s]*([A-Z]+)",
|
| 24 |
r"surname[:\s]*([A-Z]+)",
|
| 25 |
-
r"family\s*name[:\s]*([A-Z]+)"
|
| 26 |
],
|
| 27 |
"given_names": [
|
| 28 |
r"voornamen[:\s]*([A-Z]+)",
|
| 29 |
r"given\s*names[:\s]*([A-Z]+)",
|
| 30 |
-
r"first\s*name[:\s]*([A-Z]+)"
|
| 31 |
],
|
| 32 |
"nationality": [
|
| 33 |
r"nationaliteit[:\s]*([A-Za-z]+)",
|
| 34 |
-
r"nationality[:\s]*([A-Za-z]+)"
|
| 35 |
],
|
| 36 |
"date_of_birth": [
|
| 37 |
r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 38 |
r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 39 |
-
r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
|
| 40 |
-
],
|
| 41 |
-
"gender": [
|
| 42 |
-
r"geslacht[:\s]*([MF])",
|
| 43 |
-
r"gender[:\s]*([MF])",
|
| 44 |
-
r"sex[:\s]*([MF])"
|
| 45 |
],
|
|
|
|
| 46 |
"place_of_birth": [
|
| 47 |
r"geboorteplaats[:\s]*([A-Za-z\s]+)",
|
| 48 |
r"place\s*of\s*birth[:\s]*([A-Za-z\s]+)",
|
| 49 |
-
r"born\s*in[:\s]*([A-Za-z\s]+)"
|
| 50 |
],
|
| 51 |
"date_of_issue": [
|
| 52 |
r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 53 |
r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 54 |
-
r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
|
| 55 |
],
|
| 56 |
"date_of_expiry": [
|
| 57 |
r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 58 |
r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 59 |
-
r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})"
|
| 60 |
],
|
| 61 |
"personal_number": [
|
| 62 |
r"persoonsnummer[:\s]*(\d{9})",
|
| 63 |
r"personal\s*number[:\s]*(\d{9})",
|
| 64 |
-
r"bsn[:\s]*(\d{9})"
|
| 65 |
-
]
|
| 66 |
}
|
| 67 |
-
|
| 68 |
@classmethod
|
| 69 |
def extract_fields(cls, ocr_text: str) -> IdCardFields:
|
| 70 |
"""Extract structured fields from OCR text.
|
| 71 |
-
|
| 72 |
Args:
|
| 73 |
ocr_text: Raw OCR text from document processing
|
| 74 |
-
|
| 75 |
Returns:
|
| 76 |
IdCardFields object with extracted field data
|
| 77 |
"""
|
| 78 |
fields = {}
|
| 79 |
-
|
| 80 |
for field_name, patterns in cls.FIELD_PATTERNS.items():
|
| 81 |
value = None
|
| 82 |
confidence = 0.0
|
| 83 |
-
|
| 84 |
for pattern in patterns:
|
| 85 |
match = re.search(pattern, ocr_text, re.IGNORECASE)
|
| 86 |
if match:
|
| 87 |
value = match.group(1).strip()
|
| 88 |
confidence = 0.8 # Base confidence for pattern match
|
| 89 |
break
|
| 90 |
-
|
| 91 |
if value:
|
| 92 |
fields[field_name] = ExtractedField(
|
| 93 |
field_name=field_name,
|
| 94 |
value=value,
|
| 95 |
confidence=confidence,
|
| 96 |
-
source="ocr"
|
| 97 |
)
|
| 98 |
-
|
| 99 |
return IdCardFields(**fields)
|
| 100 |
-
|
| 101 |
@classmethod
|
| 102 |
def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
|
| 103 |
"""Extract MRZ data from OCR text.
|
| 104 |
-
|
| 105 |
Args:
|
| 106 |
ocr_text: Raw OCR text from document processing
|
| 107 |
-
|
| 108 |
Returns:
|
| 109 |
MRZData object if MRZ detected, None otherwise
|
| 110 |
"""
|
|
@@ -113,9 +109,9 @@ class FieldExtractor:
|
|
| 113 |
r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)", # Generic passport format (try first)
|
| 114 |
r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})", # TD1 format
|
| 115 |
r"([A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD2 format
|
| 116 |
-
r"([A-Z0-9<]{44}\n[A-Z0-9<]{44}\n[A-Z0-9<]{44})" # TD3 format
|
| 117 |
]
|
| 118 |
-
|
| 119 |
for pattern in mrz_patterns:
|
| 120 |
match = re.search(pattern, ocr_text, re.MULTILINE)
|
| 121 |
if match:
|
|
@@ -123,10 +119,10 @@ class FieldExtractor:
|
|
| 123 |
# Basic MRZ parsing (simplified)
|
| 124 |
return MRZData(
|
| 125 |
raw_text=raw_mrz,
|
| 126 |
-
format_type="TD3" if len(raw_mrz.split(
|
| 127 |
is_valid=True, # Assume valid if present
|
| 128 |
checksum_errors=[], # Not implemented in basic version
|
| 129 |
-
confidence=0.9
|
| 130 |
)
|
| 131 |
-
|
| 132 |
return None
|
|
|
|
| 11 |
|
| 12 |
class FieldExtractor:
|
| 13 |
"""Field extraction and mapping from OCR results."""
|
| 14 |
+
|
| 15 |
# Field mapping patterns for Dutch ID cards
|
| 16 |
FIELD_PATTERNS = {
|
| 17 |
"document_number": [
|
| 18 |
r"documentnummer[:\s]*([A-Z0-9]+)",
|
| 19 |
r"document\s*number[:\s]*([A-Z0-9]+)",
|
| 20 |
+
r"nr[:\s]*([A-Z0-9]+)",
|
| 21 |
],
|
| 22 |
"surname": [
|
| 23 |
r"achternaam[:\s]*([A-Z]+)",
|
| 24 |
r"surname[:\s]*([A-Z]+)",
|
| 25 |
+
r"family\s*name[:\s]*([A-Z]+)",
|
| 26 |
],
|
| 27 |
"given_names": [
|
| 28 |
r"voornamen[:\s]*([A-Z]+)",
|
| 29 |
r"given\s*names[:\s]*([A-Z]+)",
|
| 30 |
+
r"first\s*name[:\s]*([A-Z]+)",
|
| 31 |
],
|
| 32 |
"nationality": [
|
| 33 |
r"nationaliteit[:\s]*([A-Za-z]+)",
|
| 34 |
+
r"nationality[:\s]*([A-Za-z]+)",
|
| 35 |
],
|
| 36 |
"date_of_birth": [
|
| 37 |
r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 38 |
r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 39 |
+
r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
],
|
| 41 |
+
"gender": [r"geslacht[:\s]*([MF])", r"gender[:\s]*([MF])", r"sex[:\s]*([MF])"],
|
| 42 |
"place_of_birth": [
|
| 43 |
r"geboorteplaats[:\s]*([A-Za-z\s]+)",
|
| 44 |
r"place\s*of\s*birth[:\s]*([A-Za-z\s]+)",
|
| 45 |
+
r"born\s*in[:\s]*([A-Za-z\s]+)",
|
| 46 |
],
|
| 47 |
"date_of_issue": [
|
| 48 |
r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 49 |
r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 50 |
+
r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 51 |
],
|
| 52 |
"date_of_expiry": [
|
| 53 |
r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 54 |
r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 55 |
+
r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
|
| 56 |
],
|
| 57 |
"personal_number": [
|
| 58 |
r"persoonsnummer[:\s]*(\d{9})",
|
| 59 |
r"personal\s*number[:\s]*(\d{9})",
|
| 60 |
+
r"bsn[:\s]*(\d{9})",
|
| 61 |
+
],
|
| 62 |
}
|
| 63 |
+
|
| 64 |
@classmethod
|
| 65 |
def extract_fields(cls, ocr_text: str) -> IdCardFields:
|
| 66 |
"""Extract structured fields from OCR text.
|
| 67 |
+
|
| 68 |
Args:
|
| 69 |
ocr_text: Raw OCR text from document processing
|
| 70 |
+
|
| 71 |
Returns:
|
| 72 |
IdCardFields object with extracted field data
|
| 73 |
"""
|
| 74 |
fields = {}
|
| 75 |
+
|
| 76 |
for field_name, patterns in cls.FIELD_PATTERNS.items():
|
| 77 |
value = None
|
| 78 |
confidence = 0.0
|
| 79 |
+
|
| 80 |
for pattern in patterns:
|
| 81 |
match = re.search(pattern, ocr_text, re.IGNORECASE)
|
| 82 |
if match:
|
| 83 |
value = match.group(1).strip()
|
| 84 |
confidence = 0.8 # Base confidence for pattern match
|
| 85 |
break
|
| 86 |
+
|
| 87 |
if value:
|
| 88 |
fields[field_name] = ExtractedField(
|
| 89 |
field_name=field_name,
|
| 90 |
value=value,
|
| 91 |
confidence=confidence,
|
| 92 |
+
source="ocr",
|
| 93 |
)
|
| 94 |
+
|
| 95 |
return IdCardFields(**fields)
|
| 96 |
+
|
| 97 |
@classmethod
|
| 98 |
def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
|
| 99 |
"""Extract MRZ data from OCR text.
|
| 100 |
+
|
| 101 |
Args:
|
| 102 |
ocr_text: Raw OCR text from document processing
|
| 103 |
+
|
| 104 |
Returns:
|
| 105 |
MRZData object if MRZ detected, None otherwise
|
| 106 |
"""
|
|
|
|
| 109 |
r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)", # Generic passport format (try first)
|
| 110 |
r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})", # TD1 format
|
| 111 |
r"([A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD2 format
|
| 112 |
+
r"([A-Z0-9<]{44}\n[A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD3 format
|
| 113 |
]
|
| 114 |
+
|
| 115 |
for pattern in mrz_patterns:
|
| 116 |
match = re.search(pattern, ocr_text, re.MULTILINE)
|
| 117 |
if match:
|
|
|
|
| 119 |
# Basic MRZ parsing (simplified)
|
| 120 |
return MRZData(
|
| 121 |
raw_text=raw_mrz,
|
| 122 |
+
format_type="TD3" if len(raw_mrz.split("\n")) == 3 else "TD2",
|
| 123 |
is_valid=True, # Assume valid if present
|
| 124 |
checksum_errors=[], # Not implemented in basic version
|
| 125 |
+
confidence=0.9,
|
| 126 |
)
|
| 127 |
+
|
| 128 |
return None
|
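Note: a minimal usage sketch of the extractor above. The sample text is made up, and the import path (field_extraction) is an assumption based on the changed-file list in this commit, not something the diff itself shows.

# Hypothetical sample; real OCR output will be noisier.
from kybtech_dots_ocr.field_extraction import FieldExtractor

sample_text = (
    "Documentnummer: SPECI2014\n"
    "Geboortedatum: 10-03-1965\n"
    "Nationaliteit: Nederlandse\n"
)

fields = FieldExtractor.extract_fields(sample_text)
if fields.document_number is not None:
    print(fields.document_number.value, fields.document_number.confidence)  # SPECI2014 0.8

mrz = FieldExtractor.extract_mrz(sample_text)
print("MRZ detected" if mrz else "no MRZ in this sample")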
src/kybtech_dots_ocr/models.py
CHANGED
|
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
|
|
| 10 |
|
| 11 |
class ExtractedField(BaseModel):
|
| 12 |
"""Individual extracted field from identity document."""
|
|
|
|
| 13 |
field_name: str = Field(..., description="Standardized field name")
|
| 14 |
value: Optional[str] = Field(None, description="Extracted field value")
|
| 15 |
confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
|
|
@@ -18,10 +19,19 @@ class ExtractedField(BaseModel):
|
|
| 18 |
|
| 19 |
class IdCardFields(BaseModel):
|
| 20 |
"""Structured fields extracted from identity documents."""
|
| 21 |
-
document_number: Optional[ExtractedField] = Field(None, description="Document number/ID")
|
| 22 |
-
document_type: Optional[ExtractedField] = Field(None, description="Type of document")
|
| 23 |
-
issuing_country: Optional[ExtractedField] = Field(None, description="Issuing country code")
|
| 24 |
-
issuing_authority: Optional[ExtractedField] = Field(None, description="Issuing authority")
|
| 25 |
|
| 26 |
# Personal Information
|
| 27 |
surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
|
|
@@ -34,17 +44,30 @@ class IdCardFields(BaseModel):
|
|
| 34 |
# Validity Information
|
| 35 |
date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
|
| 36 |
date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
|
| 37 |
-
personal_number: Optional[ExtractedField] = Field(None, description="Personal number")
|
| 38 |
|
| 39 |
# Additional fields for specific document types
|
| 40 |
-
optional_data_1: Optional[ExtractedField] = Field(None, description="Optional data field 1")
|
| 41 |
-
optional_data_2: Optional[ExtractedField] = Field(None, description="Optional data field 2")
|
| 42 |
|
| 43 |
|
| 44 |
class MRZData(BaseModel):
|
| 45 |
"""Machine Readable Zone data extracted from identity documents."""
|
|
|
|
| 46 |
raw_text: str = Field(..., description="Raw MRZ text as extracted")
|
| 47 |
-
format_type: str = Field(..., description="MRZ format type (TD1, TD2, TD3, MRVA, MRVB)")
|
| 48 |
is_valid: bool = Field(..., description="Whether MRZ checksums are valid")
|
| 49 |
-
checksum_errors: List[str] = Field(default_factory=list, description="List of checksum validation errors")
|
| 50 |
-
confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence score")
|
| 10 |
|
| 11 |
class ExtractedField(BaseModel):
|
| 12 |
"""Individual extracted field from identity document."""
|
| 13 |
+
|
| 14 |
field_name: str = Field(..., description="Standardized field name")
|
| 15 |
value: Optional[str] = Field(None, description="Extracted field value")
|
| 16 |
confidence: float = Field(..., ge=0.0, le=1.0, description="Extraction confidence")
|
|
|
|
| 19 |
|
| 20 |
class IdCardFields(BaseModel):
|
| 21 |
"""Structured fields extracted from identity documents."""
|
| 22 |
+
|
| 23 |
+
document_number: Optional[ExtractedField] = Field(
|
| 24 |
+
None, description="Document number/ID"
|
| 25 |
+
)
|
| 26 |
+
document_type: Optional[ExtractedField] = Field(
|
| 27 |
+
None, description="Type of document"
|
| 28 |
+
)
|
| 29 |
+
issuing_country: Optional[ExtractedField] = Field(
|
| 30 |
+
None, description="Issuing country code"
|
| 31 |
+
)
|
| 32 |
+
issuing_authority: Optional[ExtractedField] = Field(
|
| 33 |
+
None, description="Issuing authority"
|
| 34 |
+
)
|
| 35 |
|
| 36 |
# Personal Information
|
| 37 |
surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
|
|
|
|
| 44 |
# Validity Information
|
| 45 |
date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
|
| 46 |
date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
|
| 47 |
+
personal_number: Optional[ExtractedField] = Field(
|
| 48 |
+
None, description="Personal number"
|
| 49 |
+
)
|
| 50 |
|
| 51 |
# Additional fields for specific document types
|
| 52 |
+
optional_data_1: Optional[ExtractedField] = Field(
|
| 53 |
+
None, description="Optional data field 1"
|
| 54 |
+
)
|
| 55 |
+
optional_data_2: Optional[ExtractedField] = Field(
|
| 56 |
+
None, description="Optional data field 2"
|
| 57 |
+
)
|
| 58 |
|
| 59 |
|
| 60 |
class MRZData(BaseModel):
|
| 61 |
"""Machine Readable Zone data extracted from identity documents."""
|
| 62 |
+
|
| 63 |
raw_text: str = Field(..., description="Raw MRZ text as extracted")
|
| 64 |
+
format_type: str = Field(
|
| 65 |
+
..., description="MRZ format type (TD1, TD2, TD3, MRVA, MRVB)"
|
| 66 |
+
)
|
| 67 |
is_valid: bool = Field(..., description="Whether MRZ checksums are valid")
|
| 68 |
+
checksum_errors: List[str] = Field(
|
| 69 |
+
default_factory=list, description="List of checksum validation errors"
|
| 70 |
+
)
|
| 71 |
+
confidence: float = Field(
|
| 72 |
+
..., ge=0.0, le=1.0, description="Extraction confidence score"
|
| 73 |
+
)
|
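Note: a short sketch of constructing these models directly, with illustrative values only (the MRZ string is a dummy placeholder, not real data).

from kybtech_dots_ocr.models import IdCardFields, MRZData

card = IdCardFields()  # every field is Optional and defaults to None
print(card.surname)    # None until an extractor fills it in

mrz = MRZData(
    raw_text="P<NLDSAMPLE<<MRZ<LINE<ONE\nSAMPLE<MRZ<LINE<TWO",
    format_type="TD3",
    is_valid=True,
    confidence=0.9,    # checksum_errors is omitted; it defaults to an empty list
)
print(mrz.format_type, mrz.checksum_errors)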
src/kybtech_dots_ocr/preprocessing.py
CHANGED
|
@@ -21,15 +21,19 @@ logger = logging.getLogger(__name__)
|
|
| 21 |
# Environment variable configuration
|
| 22 |
PDF_DPI = int(os.getenv("DOTS_OCR_PDF_DPI", "300"))
|
| 23 |
PDF_MAX_PAGES = int(os.getenv("DOTS_OCR_PDF_MAX_PAGES", "10"))
|
| 24 |
-
IMAGE_MAX_SIZE = int(os.getenv("DOTS_OCR_IMAGE_MAX_SIZE", "10")) * 1024 * 1024  # 10MB default
|
| 25 |
|
| 26 |
|
| 27 |
class ImagePreprocessor:
|
| 28 |
"""Handles image preprocessing for Dots.OCR model."""
|
| 29 |
-
|
| 30 |
-
def __init__(self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28):
|
| 31 |
"""Initialize the image preprocessor.
|
| 32 |
-
|
| 33 |
Args:
|
| 34 |
min_pixels: Minimum pixel count for images
|
| 35 |
max_pixels: Maximum pixel count for images
|
|
@@ -38,29 +42,29 @@ class ImagePreprocessor:
|
|
| 38 |
self.min_pixels = min_pixels
|
| 39 |
self.max_pixels = max_pixels
|
| 40 |
self.divisor = divisor
|
| 41 |
-
|
| 42 |
def preprocess_image(self, image: Image.Image) -> Image.Image:
|
| 43 |
"""Preprocess an image to meet model requirements.
|
| 44 |
-
|
| 45 |
Args:
|
| 46 |
image: Input PIL Image
|
| 47 |
-
|
| 48 |
Returns:
|
| 49 |
Preprocessed PIL Image
|
| 50 |
"""
|
| 51 |
# Convert to RGB if necessary
|
| 52 |
if image.mode != "RGB":
|
| 53 |
image = image.convert("RGB")
|
| 54 |
-
|
| 55 |
# Auto-orient image based on EXIF data
|
| 56 |
image = ImageOps.exif_transpose(image)
|
| 57 |
-
|
| 58 |
# Calculate current pixel count
|
| 59 |
width, height = image.size
|
| 60 |
current_pixels = width * height
|
| 61 |
-
|
| 62 |
logger.info(f"Original image size: {width}x{height} ({current_pixels} pixels)")
|
| 63 |
-
|
| 64 |
# Resize if necessary to meet pixel requirements
|
| 65 |
if current_pixels < self.min_pixels:
|
| 66 |
# Scale up to meet minimum pixel requirement
|
|
@@ -69,7 +73,7 @@ class ImagePreprocessor:
|
|
| 69 |
new_height = int(height * scale_factor)
|
| 70 |
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 71 |
logger.info(f"Scaled up image to {new_width}x{new_height}")
|
| 72 |
-
|
| 73 |
elif current_pixels > self.max_pixels:
|
| 74 |
# Scale down to meet maximum pixel requirement
|
| 75 |
scale_factor = (self.max_pixels / current_pixels) ** 0.5
|
|
@@ -77,69 +81,73 @@ class ImagePreprocessor:
|
|
| 77 |
new_height = int(height * scale_factor)
|
| 78 |
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 79 |
logger.info(f"Scaled down image to {new_width}x{new_height}")
|
| 80 |
-
|
| 81 |
# Ensure dimensions are divisible by the required divisor
|
| 82 |
width, height = image.size
|
| 83 |
new_width = ((width + self.divisor - 1) // self.divisor) * self.divisor
|
| 84 |
new_height = ((height + self.divisor - 1) // self.divisor) * self.divisor
|
| 85 |
-
|
| 86 |
if new_width != width or new_height != height:
|
| 87 |
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 88 |
-
logger.info(
|
| 89 |
-
|
|
|
|
|
|
|
| 90 |
return image
|
| 91 |
-
|
| 92 |
-
def crop_by_roi(self, image: Image.Image, roi: Tuple[float, float, float, float]) -> Image.Image:
|
|
|
|
|
|
|
| 93 |
"""Crop image using ROI coordinates.
|
| 94 |
-
|
| 95 |
Args:
|
| 96 |
image: Input PIL Image
|
| 97 |
roi: ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
|
| 98 |
-
|
| 99 |
Returns:
|
| 100 |
Cropped PIL Image
|
| 101 |
"""
|
| 102 |
x1, y1, x2, y2 = roi
|
| 103 |
width, height = image.size
|
| 104 |
-
|
| 105 |
# Convert normalized coordinates to pixel coordinates
|
| 106 |
x1_px = int(x1 * width)
|
| 107 |
y1_px = int(y1 * height)
|
| 108 |
x2_px = int(x2 * width)
|
| 109 |
y2_px = int(y2 * height)
|
| 110 |
-
|
| 111 |
# Ensure coordinates are within image bounds
|
| 112 |
x1_px = max(0, min(x1_px, width))
|
| 113 |
y1_px = max(0, min(y1_px, height))
|
| 114 |
x2_px = max(x1_px, min(x2_px, width))
|
| 115 |
y2_px = max(y1_px, min(y2_px, height))
|
| 116 |
-
|
| 117 |
# Crop the image
|
| 118 |
cropped = image.crop((x1_px, y1_px, x2_px, y2_px))
|
| 119 |
logger.info(f"Cropped image to {x2_px - x1_px}x{y2_px - y1_px} pixels")
|
| 120 |
-
|
| 121 |
return cropped
|
| 122 |
|
| 123 |
|
| 124 |
class PDFProcessor:
|
| 125 |
"""Handles PDF to image conversion and multi-page processing."""
|
| 126 |
-
|
| 127 |
def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
|
| 128 |
"""Initialize the PDF processor.
|
| 129 |
-
|
| 130 |
Args:
|
| 131 |
dpi: DPI for PDF to image conversion
|
| 132 |
max_pages: Maximum number of pages to process
|
| 133 |
"""
|
| 134 |
self.dpi = dpi
|
| 135 |
self.max_pages = max_pages
|
| 136 |
-
|
| 137 |
def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
|
| 138 |
"""Convert PDF to list of images.
|
| 139 |
-
|
| 140 |
Args:
|
| 141 |
pdf_data: PDF file data as bytes
|
| 142 |
-
|
| 143 |
Returns:
|
| 144 |
List of PIL Images, one per page
|
| 145 |
"""
|
|
@@ -147,49 +155,49 @@ class PDFProcessor:
|
|
| 147 |
# Open PDF from bytes
|
| 148 |
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
| 149 |
images = []
|
| 150 |
-
|
| 151 |
# Limit number of pages to process
|
| 152 |
num_pages = min(len(pdf_document), self.max_pages)
|
| 153 |
logger.info(f"Processing {num_pages} pages from PDF")
|
| 154 |
-
|
| 155 |
for page_num in range(num_pages):
|
| 156 |
page = pdf_document[page_num]
|
| 157 |
-
|
| 158 |
# Convert page to image
|
| 159 |
mat = fitz.Matrix(self.dpi / 72, self.dpi / 72) # 72 is default DPI
|
| 160 |
pix = page.get_pixmap(matrix=mat)
|
| 161 |
-
|
| 162 |
# Convert to PIL Image
|
| 163 |
img_data = pix.tobytes("png")
|
| 164 |
image = Image.open(io.BytesIO(img_data))
|
| 165 |
images.append(image)
|
| 166 |
-
|
| 167 |
logger.info(f"Converted page {page_num + 1} to image: {image.size}")
|
| 168 |
-
|
| 169 |
pdf_document.close()
|
| 170 |
return images
|
| 171 |
-
|
| 172 |
except Exception as e:
|
| 173 |
logger.error(f"Failed to convert PDF to images: {e}")
|
| 174 |
raise RuntimeError(f"PDF conversion failed: {e}")
|
| 175 |
-
|
| 176 |
def is_pdf(self, file_data: bytes) -> bool:
|
| 177 |
"""Check if file data is a PDF.
|
| 178 |
-
|
| 179 |
Args:
|
| 180 |
file_data: File data as bytes
|
| 181 |
-
|
| 182 |
Returns:
|
| 183 |
True if file is a PDF
|
| 184 |
"""
|
| 185 |
-
return file_data.startswith(b'%PDF-')
|
| 186 |
-
|
| 187 |
def get_pdf_page_count(self, pdf_data: bytes) -> int:
|
| 188 |
"""Get the number of pages in a PDF.
|
| 189 |
-
|
| 190 |
Args:
|
| 191 |
pdf_data: PDF file data as bytes
|
| 192 |
-
|
| 193 |
Returns:
|
| 194 |
Number of pages in the PDF
|
| 195 |
"""
|
|
@@ -205,23 +213,21 @@ class PDFProcessor:
|
|
| 205 |
|
| 206 |
class DocumentProcessor:
|
| 207 |
"""Main document processing class that handles both images and PDFs."""
|
| 208 |
-
|
| 209 |
def __init__(self):
|
| 210 |
"""Initialize the document processor."""
|
| 211 |
self.image_preprocessor = ImagePreprocessor()
|
| 212 |
self.pdf_processor = PDFProcessor()
|
| 213 |
-
|
| 214 |
def process_document(
|
| 215 |
-
self,
|
| 216 |
-
file_data: bytes,
|
| 217 |
-
roi: Optional[Tuple[float, float, float, float]] = None
|
| 218 |
) -> List[Image.Image]:
|
| 219 |
"""Process a document (image or PDF) and return preprocessed images.
|
| 220 |
-
|
| 221 |
Args:
|
| 222 |
file_data: Document file data as bytes
|
| 223 |
roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
|
| 224 |
-
|
| 225 |
Returns:
|
| 226 |
List of preprocessed PIL Images
|
| 227 |
"""
|
|
@@ -238,7 +244,7 @@ class DocumentProcessor:
|
|
| 238 |
except Exception as e:
|
| 239 |
logger.error(f"Failed to open image: {e}")
|
| 240 |
raise RuntimeError(f"Image processing failed: {e}")
|
| 241 |
-
|
| 242 |
# Preprocess each image
|
| 243 |
processed_images = []
|
| 244 |
for i, image in enumerate(images):
|
|
@@ -246,30 +252,30 @@ class DocumentProcessor:
|
|
| 246 |
# Apply ROI cropping if provided
|
| 247 |
if roi is not None:
|
| 248 |
image = self.image_preprocessor.crop_by_roi(image, roi)
|
| 249 |
-
|
| 250 |
# Preprocess image for model requirements
|
| 251 |
processed_image = self.image_preprocessor.preprocess_image(image)
|
| 252 |
processed_images.append(processed_image)
|
| 253 |
-
|
| 254 |
logger.info(f"Processed image {i + 1}: {processed_image.size}")
|
| 255 |
-
|
| 256 |
except Exception as e:
|
| 257 |
logger.error(f"Failed to preprocess image {i + 1}: {e}")
|
| 258 |
# Continue with other images even if one fails
|
| 259 |
continue
|
| 260 |
-
|
| 261 |
if not processed_images:
|
| 262 |
raise RuntimeError("No images could be processed from the document")
|
| 263 |
-
|
| 264 |
logger.info(f"Successfully processed {len(processed_images)} images")
|
| 265 |
return processed_images
|
| 266 |
-
|
| 267 |
def validate_file_size(self, file_data: bytes) -> bool:
|
| 268 |
"""Validate that file size is within limits.
|
| 269 |
-
|
| 270 |
Args:
|
| 271 |
file_data: File data as bytes
|
| 272 |
-
|
| 273 |
Returns:
|
| 274 |
True if file size is acceptable
|
| 275 |
"""
|
|
@@ -278,25 +284,25 @@ class DocumentProcessor:
|
|
| 278 |
logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
|
| 279 |
return False
|
| 280 |
return True
|
| 281 |
-
|
| 282 |
def get_document_info(self, file_data: bytes) -> dict:
|
| 283 |
"""Get information about the document.
|
| 284 |
-
|
| 285 |
Args:
|
| 286 |
file_data: Document file data as bytes
|
| 287 |
-
|
| 288 |
Returns:
|
| 289 |
Dictionary with document information
|
| 290 |
"""
|
| 291 |
info = {
|
| 292 |
"file_size": len(file_data),
|
| 293 |
"is_pdf": self.pdf_processor.is_pdf(file_data),
|
| 294 |
-
"page_count": 1
|
| 295 |
}
|
| 296 |
-
|
| 297 |
if info["is_pdf"]:
|
| 298 |
info["page_count"] = self.pdf_processor.get_pdf_page_count(file_data)
|
| 299 |
-
|
| 300 |
return info
|
| 301 |
|
| 302 |
|
|
@@ -313,8 +319,7 @@ def get_document_processor() -> DocumentProcessor:
|
|
| 313 |
|
| 314 |
|
| 315 |
def process_document(
|
| 316 |
-
file_data: bytes,
|
| 317 |
-
roi: Optional[Tuple[float, float, float, float]] = None
|
| 318 |
) -> List[Image.Image]:
|
| 319 |
"""Process a document and return preprocessed images."""
|
| 320 |
processor = get_document_processor()
|
|
|
|
| 21 |
# Environment variable configuration
|
| 22 |
PDF_DPI = int(os.getenv("DOTS_OCR_PDF_DPI", "300"))
|
| 23 |
PDF_MAX_PAGES = int(os.getenv("DOTS_OCR_PDF_MAX_PAGES", "10"))
|
| 24 |
+
IMAGE_MAX_SIZE = (
|
| 25 |
+
int(os.getenv("DOTS_OCR_IMAGE_MAX_SIZE", "10")) * 1024 * 1024
|
| 26 |
+
) # 10MB default
|
| 27 |
|
| 28 |
|
| 29 |
class ImagePreprocessor:
|
| 30 |
"""Handles image preprocessing for Dots.OCR model."""
|
| 31 |
+
|
| 32 |
+
def __init__(
|
| 33 |
+
self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28
|
| 34 |
+
):
|
| 35 |
"""Initialize the image preprocessor.
|
| 36 |
+
|
| 37 |
Args:
|
| 38 |
min_pixels: Minimum pixel count for images
|
| 39 |
max_pixels: Maximum pixel count for images
|
|
|
|
| 42 |
self.min_pixels = min_pixels
|
| 43 |
self.max_pixels = max_pixels
|
| 44 |
self.divisor = divisor
|
| 45 |
+
|
| 46 |
def preprocess_image(self, image: Image.Image) -> Image.Image:
|
| 47 |
"""Preprocess an image to meet model requirements.
|
| 48 |
+
|
| 49 |
Args:
|
| 50 |
image: Input PIL Image
|
| 51 |
+
|
| 52 |
Returns:
|
| 53 |
Preprocessed PIL Image
|
| 54 |
"""
|
| 55 |
# Convert to RGB if necessary
|
| 56 |
if image.mode != "RGB":
|
| 57 |
image = image.convert("RGB")
|
| 58 |
+
|
| 59 |
# Auto-orient image based on EXIF data
|
| 60 |
image = ImageOps.exif_transpose(image)
|
| 61 |
+
|
| 62 |
# Calculate current pixel count
|
| 63 |
width, height = image.size
|
| 64 |
current_pixels = width * height
|
| 65 |
+
|
| 66 |
logger.info(f"Original image size: {width}x{height} ({current_pixels} pixels)")
|
| 67 |
+
|
| 68 |
# Resize if necessary to meet pixel requirements
|
| 69 |
if current_pixels < self.min_pixels:
|
| 70 |
# Scale up to meet minimum pixel requirement
|
|
|
|
| 73 |
new_height = int(height * scale_factor)
|
| 74 |
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 75 |
logger.info(f"Scaled up image to {new_width}x{new_height}")
|
| 76 |
+
|
| 77 |
elif current_pixels > self.max_pixels:
|
| 78 |
# Scale down to meet maximum pixel requirement
|
| 79 |
scale_factor = (self.max_pixels / current_pixels) ** 0.5
|
|
|
|
| 81 |
new_height = int(height * scale_factor)
|
| 82 |
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 83 |
logger.info(f"Scaled down image to {new_width}x{new_height}")
|
| 84 |
+
|
| 85 |
# Ensure dimensions are divisible by the required divisor
|
| 86 |
width, height = image.size
|
| 87 |
new_width = ((width + self.divisor - 1) // self.divisor) * self.divisor
|
| 88 |
new_height = ((height + self.divisor - 1) // self.divisor) * self.divisor
|
| 89 |
+
|
| 90 |
if new_width != width or new_height != height:
|
| 91 |
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 92 |
+
logger.info(
|
| 93 |
+
f"Adjusted dimensions to be divisible by {self.divisor}: {new_width}x{new_height}"
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
return image
|
| 97 |
+
|
| 98 |
+
def crop_by_roi(
|
| 99 |
+
self, image: Image.Image, roi: Tuple[float, float, float, float]
|
| 100 |
+
) -> Image.Image:
|
| 101 |
"""Crop image using ROI coordinates.
|
| 102 |
+
|
| 103 |
Args:
|
| 104 |
image: Input PIL Image
|
| 105 |
roi: ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
|
| 106 |
+
|
| 107 |
Returns:
|
| 108 |
Cropped PIL Image
|
| 109 |
"""
|
| 110 |
x1, y1, x2, y2 = roi
|
| 111 |
width, height = image.size
|
| 112 |
+
|
| 113 |
# Convert normalized coordinates to pixel coordinates
|
| 114 |
x1_px = int(x1 * width)
|
| 115 |
y1_px = int(y1 * height)
|
| 116 |
x2_px = int(x2 * width)
|
| 117 |
y2_px = int(y2 * height)
|
| 118 |
+
|
| 119 |
# Ensure coordinates are within image bounds
|
| 120 |
x1_px = max(0, min(x1_px, width))
|
| 121 |
y1_px = max(0, min(y1_px, height))
|
| 122 |
x2_px = max(x1_px, min(x2_px, width))
|
| 123 |
y2_px = max(y1_px, min(y2_px, height))
|
| 124 |
+
|
| 125 |
# Crop the image
|
| 126 |
cropped = image.crop((x1_px, y1_px, x2_px, y2_px))
|
| 127 |
logger.info(f"Cropped image to {x2_px - x1_px}x{y2_px - y1_px} pixels")
|
| 128 |
+
|
| 129 |
return cropped
|
| 130 |
|
| 131 |
|
| 132 |
class PDFProcessor:
|
| 133 |
"""Handles PDF to image conversion and multi-page processing."""
|
| 134 |
+
|
| 135 |
def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
|
| 136 |
"""Initialize the PDF processor.
|
| 137 |
+
|
| 138 |
Args:
|
| 139 |
dpi: DPI for PDF to image conversion
|
| 140 |
max_pages: Maximum number of pages to process
|
| 141 |
"""
|
| 142 |
self.dpi = dpi
|
| 143 |
self.max_pages = max_pages
|
| 144 |
+
|
| 145 |
def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
|
| 146 |
"""Convert PDF to list of images.
|
| 147 |
+
|
| 148 |
Args:
|
| 149 |
pdf_data: PDF file data as bytes
|
| 150 |
+
|
| 151 |
Returns:
|
| 152 |
List of PIL Images, one per page
|
| 153 |
"""
|
|
|
|
| 155 |
# Open PDF from bytes
|
| 156 |
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
| 157 |
images = []
|
| 158 |
+
|
| 159 |
# Limit number of pages to process
|
| 160 |
num_pages = min(len(pdf_document), self.max_pages)
|
| 161 |
logger.info(f"Processing {num_pages} pages from PDF")
|
| 162 |
+
|
| 163 |
for page_num in range(num_pages):
|
| 164 |
page = pdf_document[page_num]
|
| 165 |
+
|
| 166 |
# Convert page to image
|
| 167 |
mat = fitz.Matrix(self.dpi / 72, self.dpi / 72) # 72 is default DPI
|
| 168 |
pix = page.get_pixmap(matrix=mat)
|
| 169 |
+
|
| 170 |
# Convert to PIL Image
|
| 171 |
img_data = pix.tobytes("png")
|
| 172 |
image = Image.open(io.BytesIO(img_data))
|
| 173 |
images.append(image)
|
| 174 |
+
|
| 175 |
logger.info(f"Converted page {page_num + 1} to image: {image.size}")
|
| 176 |
+
|
| 177 |
pdf_document.close()
|
| 178 |
return images
|
| 179 |
+
|
| 180 |
except Exception as e:
|
| 181 |
logger.error(f"Failed to convert PDF to images: {e}")
|
| 182 |
raise RuntimeError(f"PDF conversion failed: {e}")
|
| 183 |
+
|
| 184 |
def is_pdf(self, file_data: bytes) -> bool:
|
| 185 |
"""Check if file data is a PDF.
|
| 186 |
+
|
| 187 |
Args:
|
| 188 |
file_data: File data as bytes
|
| 189 |
+
|
| 190 |
Returns:
|
| 191 |
True if file is a PDF
|
| 192 |
"""
|
| 193 |
+
return file_data.startswith(b"%PDF-")
|
| 194 |
+
|
| 195 |
def get_pdf_page_count(self, pdf_data: bytes) -> int:
|
| 196 |
"""Get the number of pages in a PDF.
|
| 197 |
+
|
| 198 |
Args:
|
| 199 |
pdf_data: PDF file data as bytes
|
| 200 |
+
|
| 201 |
Returns:
|
| 202 |
Number of pages in the PDF
|
| 203 |
"""
|
|
|
|
| 213 |
|
| 214 |
class DocumentProcessor:
|
| 215 |
"""Main document processing class that handles both images and PDFs."""
|
| 216 |
+
|
| 217 |
def __init__(self):
|
| 218 |
"""Initialize the document processor."""
|
| 219 |
self.image_preprocessor = ImagePreprocessor()
|
| 220 |
self.pdf_processor = PDFProcessor()
|
| 221 |
+
|
| 222 |
def process_document(
|
| 223 |
+
self, file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
|
|
|
|
|
|
|
| 224 |
) -> List[Image.Image]:
|
| 225 |
"""Process a document (image or PDF) and return preprocessed images.
|
| 226 |
+
|
| 227 |
Args:
|
| 228 |
file_data: Document file data as bytes
|
| 229 |
roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1]
|
| 230 |
+
|
| 231 |
Returns:
|
| 232 |
List of preprocessed PIL Images
|
| 233 |
"""
|
|
|
|
| 244 |
except Exception as e:
|
| 245 |
logger.error(f"Failed to open image: {e}")
|
| 246 |
raise RuntimeError(f"Image processing failed: {e}")
|
| 247 |
+
|
| 248 |
# Preprocess each image
|
| 249 |
processed_images = []
|
| 250 |
for i, image in enumerate(images):
|
|
|
|
| 252 |
# Apply ROI cropping if provided
|
| 253 |
if roi is not None:
|
| 254 |
image = self.image_preprocessor.crop_by_roi(image, roi)
|
| 255 |
+
|
| 256 |
# Preprocess image for model requirements
|
| 257 |
processed_image = self.image_preprocessor.preprocess_image(image)
|
| 258 |
processed_images.append(processed_image)
|
| 259 |
+
|
| 260 |
logger.info(f"Processed image {i + 1}: {processed_image.size}")
|
| 261 |
+
|
| 262 |
except Exception as e:
|
| 263 |
logger.error(f"Failed to preprocess image {i + 1}: {e}")
|
| 264 |
# Continue with other images even if one fails
|
| 265 |
continue
|
| 266 |
+
|
| 267 |
if not processed_images:
|
| 268 |
raise RuntimeError("No images could be processed from the document")
|
| 269 |
+
|
| 270 |
logger.info(f"Successfully processed {len(processed_images)} images")
|
| 271 |
return processed_images
|
| 272 |
+
|
| 273 |
def validate_file_size(self, file_data: bytes) -> bool:
|
| 274 |
"""Validate that file size is within limits.
|
| 275 |
+
|
| 276 |
Args:
|
| 277 |
file_data: File data as bytes
|
| 278 |
+
|
| 279 |
Returns:
|
| 280 |
True if file size is acceptable
|
| 281 |
"""
|
|
|
|
| 284 |
logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
|
| 285 |
return False
|
| 286 |
return True
|
| 287 |
+
|
| 288 |
def get_document_info(self, file_data: bytes) -> dict:
|
| 289 |
"""Get information about the document.
|
| 290 |
+
|
| 291 |
Args:
|
| 292 |
file_data: Document file data as bytes
|
| 293 |
+
|
| 294 |
Returns:
|
| 295 |
Dictionary with document information
|
| 296 |
"""
|
| 297 |
info = {
|
| 298 |
"file_size": len(file_data),
|
| 299 |
"is_pdf": self.pdf_processor.is_pdf(file_data),
|
| 300 |
+
"page_count": 1,
|
| 301 |
}
|
| 302 |
+
|
| 303 |
if info["is_pdf"]:
|
| 304 |
info["page_count"] = self.pdf_processor.get_pdf_page_count(file_data)
|
| 305 |
+
|
| 306 |
return info
|
| 307 |
|
| 308 |
|
|
|
|
| 319 |
|
| 320 |
|
| 321 |
def process_document(
|
| 322 |
+
file_data: bytes, roi: Optional[Tuple[float, float, float, float]] = None
|
|
|
|
| 323 |
) -> List[Image.Image]:
|
| 324 |
"""Process a document and return preprocessed images."""
|
| 325 |
processor = get_document_processor()
|
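Note: a usage sketch for the preprocessing pipeline above; the file name and ROI values are made up for illustration.

from kybtech_dots_ocr.preprocessing import process_document

with open("sample_id_card.jpg", "rb") as f:  # hypothetical input file
    file_data = f.read()

# Crop each page to its central region, then resize to the model's pixel budget;
# output dimensions are rounded up to multiples of 28 (the default divisor).
images = process_document(file_data, roi=(0.05, 0.05, 0.95, 0.95))
for i, img in enumerate(images):
    print(f"page {i + 1}: {img.size}")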
src/kybtech_dots_ocr/response_builder.py
CHANGED
|
@@ -2,9 +2,13 @@
|
|
| 2 |
|
| 3 |
This module handles the construction and validation of OCR API responses
|
| 4 |
according to the specified schema with proper error handling and metadata.
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import logging
|
|
|
|
| 8 |
import time
|
| 9 |
from typing import List, Optional, Dict, Any
|
| 10 |
from datetime import datetime
|
|
@@ -29,7 +33,8 @@ class OCRResponseBuilder:
|
|
| 29 |
media_type: str,
|
| 30 |
processing_time: float,
|
| 31 |
ocr_texts: List[str],
|
| 32 |
-
page_metadata: Optional[List[Dict[str, Any]]] = None
|
|
|
|
| 33 |
) -> OCRResponse:
|
| 34 |
"""Build a complete OCR response from extracted texts.
|
| 35 |
|
|
@@ -39,6 +44,7 @@ class OCRResponseBuilder:
|
|
| 39 |
processing_time: Total processing time in seconds
|
| 40 |
ocr_texts: List of OCR text results (one per page)
|
| 41 |
page_metadata: Optional metadata for each page
|
|
|
|
| 42 |
|
| 43 |
Returns:
|
| 44 |
Complete OCRResponse object
|
|
@@ -46,6 +52,8 @@ class OCRResponseBuilder:
|
|
| 46 |
logger.info(f"Building response for {len(ocr_texts)} pages")
|
| 47 |
|
| 48 |
detections = []
|
|
|
|
|
|
|
| 49 |
|
| 50 |
for i, ocr_text in enumerate(ocr_texts):
|
| 51 |
try:
|
|
@@ -53,6 +61,40 @@ class OCRResponseBuilder:
|
|
| 53 |
extracted_fields = self.field_extractor.extract_fields(ocr_text)
|
| 54 |
mrz_data = self.field_extractor.extract_mrz(ocr_text)
|
| 55 |
|
|
|
|
| 56 |
# Create detection for this page
|
| 57 |
detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
|
| 58 |
detections.append(detection)
|
|
@@ -304,11 +346,19 @@ def build_ocr_response(
|
|
| 304 |
media_type: str,
|
| 305 |
processing_time: float,
|
| 306 |
ocr_texts: List[str],
|
| 307 |
-
page_metadata: Optional[List[Dict[str, Any]]] = None
|
|
|
|
| 308 |
) -> OCRResponse:
|
| 309 |
"""Build a complete OCR response from extracted texts."""
|
| 310 |
builder = get_response_builder()
|
| 311 |
-
return builder.build_response(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
|
| 314 |
def build_error_response(
|
|
|
|
| 2 |
|
| 3 |
This module handles the construction and validation of OCR API responses
|
| 4 |
according to the specified schema with proper error handling and metadata.
|
| 5 |
+
|
| 6 |
+
Debug-mode logging is supported to surface detailed information about
|
| 7 |
+
extraction results when troubleshooting in environments like Hugging Face.
|
| 8 |
"""
|
| 9 |
|
| 10 |
import logging
|
| 11 |
+
import os
|
| 12 |
import time
|
| 13 |
from typing import List, Optional, Dict, Any
|
| 14 |
from datetime import datetime
|
|
|
|
| 33 |
media_type: str,
|
| 34 |
processing_time: float,
|
| 35 |
ocr_texts: List[str],
|
| 36 |
+
page_metadata: Optional[List[Dict[str, Any]]] = None,
|
| 37 |
+
debug: bool = False,
|
| 38 |
) -> OCRResponse:
|
| 39 |
"""Build a complete OCR response from extracted texts.
|
| 40 |
|
|
|
|
| 44 |
processing_time: Total processing time in seconds
|
| 45 |
ocr_texts: List of OCR text results (one per page)
|
| 46 |
page_metadata: Optional metadata for each page
|
| 47 |
+
debug: When True, emit detailed logs about OCR text and mapping
|
| 48 |
|
| 49 |
Returns:
|
| 50 |
Complete OCRResponse object
|
|
|
|
| 52 |
logger.info(f"Building response for {len(ocr_texts)} pages")
|
| 53 |
|
| 54 |
detections = []
|
| 55 |
+
# Allow configuring the OCR text snippet length via env var. Defaults to 1200.
|
| 56 |
+
debug_snippet_len = int(os.getenv("DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN", "1200"))
|
| 57 |
|
| 58 |
for i, ocr_text in enumerate(ocr_texts):
|
| 59 |
try:
|
|
|
|
| 61 |
extracted_fields = self.field_extractor.extract_fields(ocr_text)
|
| 62 |
mrz_data = self.field_extractor.extract_mrz(ocr_text)
|
| 63 |
|
| 64 |
+
# In debug mode, log OCR text snippet and extracted mapping details.
|
| 65 |
+
if debug:
|
| 66 |
+
# Log a bounded snippet of the OCR text to avoid overwhelming logs
|
| 67 |
+
snippet = ocr_text[:debug_snippet_len]
|
| 68 |
+
if len(ocr_text) > debug_snippet_len:
|
| 69 |
+
snippet += "\n...[truncated]"
|
| 70 |
+
logger.info(
|
| 71 |
+
f"[debug] Page {i + 1}: OCR text snippet (len={len(ocr_text)}):\n{snippet}"
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# Prepare a compact dict of non-null extracted fields
|
| 75 |
+
non_null_fields: Dict[str, Any] = {}
|
| 76 |
+
for fname, fval in extracted_fields.__dict__.items():
|
| 77 |
+
if fval is not None:
|
| 78 |
+
non_null_fields[fname] = {
|
| 79 |
+
"value": fval.value,
|
| 80 |
+
"confidence": fval.confidence,
|
| 81 |
+
"source": fval.source,
|
| 82 |
+
}
|
| 83 |
+
logger.info(
|
| 84 |
+
f"[debug] Page {i + 1}: Extracted fields (non-null): {non_null_fields}"
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
if mrz_data is not None:
|
| 88 |
+
# Support both canonical and legacy attribute names
|
| 89 |
+
raw_mrz = getattr(mrz_data, "raw_mrz", None) or getattr(mrz_data, "raw_text", None)
|
| 90 |
+
logger.info(
|
| 91 |
+
f"[debug] Page {i + 1}: MRZ detected — type={getattr(mrz_data, 'document_type', None) or getattr(mrz_data, 'format_type', None)}, confidence={mrz_data.confidence:.2f}"
|
| 92 |
+
)
|
| 93 |
+
if raw_mrz:
|
| 94 |
+
logger.info(f"[debug] Page {i + 1}: MRZ raw text:\n{raw_mrz}")
|
| 95 |
+
else:
|
| 96 |
+
logger.info(f"[debug] Page {i + 1}: No MRZ detected")
|
| 97 |
+
|
| 98 |
# Create detection for this page
|
| 99 |
detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
|
| 100 |
detections.append(detection)
|
|
|
|
| 346 |
media_type: str,
|
| 347 |
processing_time: float,
|
| 348 |
ocr_texts: List[str],
|
| 349 |
+
page_metadata: Optional[List[Dict[str, Any]]] = None,
|
| 350 |
+
debug: bool = False,
|
| 351 |
) -> OCRResponse:
|
| 352 |
"""Build a complete OCR response from extracted texts."""
|
| 353 |
builder = get_response_builder()
|
| 354 |
+
return builder.build_response(
|
| 355 |
+
request_id=request_id,
|
| 356 |
+
media_type=media_type,
|
| 357 |
+
processing_time=processing_time,
|
| 358 |
+
ocr_texts=ocr_texts,
|
| 359 |
+
page_metadata=page_metadata,
|
| 360 |
+
debug=debug,
|
| 361 |
+
)
|
| 362 |
|
| 363 |
|
| 364 |
def build_error_response(
|
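Note: a sketch of calling the module-level wrapper with the new debug flag; the request values and OCR text are illustrative only.

import logging
import os

os.environ["DOTS_OCR_DEBUG_TEXT_SNIPPET_LEN"] = "500"  # bound the logged OCR snippet
logging.basicConfig(level=logging.INFO)

from kybtech_dots_ocr.response_builder import build_ocr_response

response = build_ocr_response(
    request_id="req-0001",
    media_type="image/jpeg",
    processing_time=1.42,
    ocr_texts=["Documentnummer: SPECI2014\nAchternaam: DE BRUIJN"],
    debug=True,  # logs an OCR text snippet, non-null fields, and MRZ details per page
)
print(response)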