"""Field extraction utilities for OCR text processing. This module provides field extraction and mapping from OCR results to structured KYB field formats. """ import re from typing import Optional from models import ExtractedField, IdCardFields, MRZData class FieldExtractor: """Field extraction and mapping from OCR results.""" # Field mapping patterns for Dutch ID cards FIELD_PATTERNS = { "document_number": [ r"documentnummer[:\s]*([A-Z0-9]+)", r"document\s*number[:\s]*([A-Z0-9]+)", r"nr[:\s]*([A-Z0-9]+)" ], "surname": [ r"achternaam[:\s]*([A-Z]+)", r"surname[:\s]*([A-Z]+)", r"family\s*name[:\s]*([A-Z]+)" ], "given_names": [ r"voornamen[:\s]*([A-Z]+)", r"given\s*names[:\s]*([A-Z]+)", r"first\s*name[:\s]*([A-Z]+)" ], "nationality": [ r"nationaliteit[:\s]*([A-Za-z]+)", r"nationality[:\s]*([A-Za-z]+)" ], "date_of_birth": [ r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})" ], "gender": [ r"geslacht[:\s]*([MF])", r"gender[:\s]*([MF])", r"sex[:\s]*([MF])" ], "place_of_birth": [ r"geboorteplaats[:\s]*([A-Za-z\s]+)", r"place\s*of\s*birth[:\s]*([A-Za-z\s]+)", r"born\s*in[:\s]*([A-Za-z\s]+)" ], "date_of_issue": [ r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})" ], "date_of_expiry": [ r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})" ], "personal_number": [ r"persoonsnummer[:\s]*(\d{9})", r"personal\s*number[:\s]*(\d{9})", r"bsn[:\s]*(\d{9})" ] } @classmethod def extract_fields(cls, ocr_text: str) -> IdCardFields: """Extract structured fields from OCR text. Args: ocr_text: Raw OCR text from document processing Returns: IdCardFields object with extracted field data """ fields = {} for field_name, patterns in cls.FIELD_PATTERNS.items(): value = None confidence = 0.0 for pattern in patterns: match = re.search(pattern, ocr_text, re.IGNORECASE) if match: value = match.group(1).strip() confidence = 0.8 # Base confidence for pattern match break if value: fields[field_name] = ExtractedField( field_name=field_name, value=value, confidence=confidence, source="ocr" ) return IdCardFields(**fields) @classmethod def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]: """Extract MRZ data from OCR text. Args: ocr_text: Raw OCR text from document processing Returns: MRZData object if MRZ detected, None otherwise """ # Look for MRZ patterns (TD1, TD2, TD3) mrz_patterns = [ r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)", # Generic passport format (try first) r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})", # TD1 format r"([A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD2 format r"([A-Z0-9<]{44}\n[A-Z0-9<]{44}\n[A-Z0-9<]{44})" # TD3 format ] for pattern in mrz_patterns: match = re.search(pattern, ocr_text, re.MULTILINE) if match: raw_mrz = match.group(1) # Basic MRZ parsing (simplified) return MRZData( raw_text=raw_mrz, format_type="TD3" if len(raw_mrz.split('\n')) == 3 else "TD2", is_valid=True, # Assume valid if present checksum_errors=[], # Not implemented in basic version confidence=0.9 ) return None