Spaces:
Paused
Paused
| """API models for Dots.OCR text extraction service. | |
| This module defines the data structures used for API requests, | |
| responses, and internal data processing. | |
| """ | |
| from typing import List, Optional, Dict, Any | |
| from pydantic import BaseModel, Field | |
class BoundingBox(BaseModel):
    """Normalized bounding box coordinates."""

    # All four coordinates are required and validated to lie in [0.0, 1.0].
    # (x1, y1) is the top-left corner; (x2, y2) is the bottom-right corner.
    # NOTE: no ordering between the corners (e.g. x1 <= x2) is enforced here.

    x1: float = Field(
        ...,
        description="Top-left x coordinate",
        ge=0.0,
        le=1.0,
    )
    y1: float = Field(
        ...,
        description="Top-left y coordinate",
        ge=0.0,
        le=1.0,
    )
    x2: float = Field(
        ...,
        description="Bottom-right x coordinate",
        ge=0.0,
        le=1.0,
    )
    y2: float = Field(
        ...,
        description="Bottom-right y coordinate",
        ge=0.0,
        le=1.0,
    )
class ExtractedField(BaseModel):
    """Individual extracted field with confidence and source."""

    # Required: name under which the value is reported.
    field_name: str = Field(
        ...,
        description="Standardized field name",
    )

    # Optional: defaults to None when no value is supplied.
    value: Optional[str] = Field(
        default=None,
        description="Extracted field value",
    )

    # Required: validated to lie in [0.0, 1.0].
    confidence: float = Field(
        ...,
        description="Extraction confidence",
        ge=0.0,
        le=1.0,
    )

    # Required: free-form string; not restricted to a fixed set of values here.
    source: str = Field(
        ...,
        description="Extraction source (e.g., 'ocr')",
    )
class IdCardFields(BaseModel):
    """Structured fields extracted from identity documents."""

    # Every attribute is an optional ExtractedField that defaults to None,
    # so an instance can be created with any subset of fields populated.

    # Document Information
    document_number: Optional[ExtractedField] = Field(
        default=None,
        description="Document number/ID",
    )
    document_type: Optional[ExtractedField] = Field(
        default=None,
        description="Type of document",
    )
    issuing_country: Optional[ExtractedField] = Field(
        default=None,
        description="Issuing country code",
    )
    issuing_authority: Optional[ExtractedField] = Field(
        default=None,
        description="Issuing authority",
    )

    # Personal Information
    surname: Optional[ExtractedField] = Field(
        default=None,
        description="Family name/surname",
    )
    given_names: Optional[ExtractedField] = Field(
        default=None,
        description="Given names",
    )
    nationality: Optional[ExtractedField] = Field(
        default=None,
        description="Nationality code",
    )
    date_of_birth: Optional[ExtractedField] = Field(
        default=None,
        description="Date of birth",
    )
    gender: Optional[ExtractedField] = Field(
        default=None,
        description="Gender",
    )
    place_of_birth: Optional[ExtractedField] = Field(
        default=None,
        description="Place of birth",
    )

    # Validity Information
    date_of_issue: Optional[ExtractedField] = Field(
        default=None,
        description="Date of issue",
    )
    date_of_expiry: Optional[ExtractedField] = Field(
        default=None,
        description="Date of expiry",
    )
    personal_number: Optional[ExtractedField] = Field(
        default=None,
        description="Personal number",
    )

    # Additional fields for specific document types
    optional_data_1: Optional[ExtractedField] = Field(
        default=None,
        description="Optional data field 1",
    )
    optional_data_2: Optional[ExtractedField] = Field(
        default=None,
        description="Optional data field 2",
    )
class ExtractedFields(BaseModel):
    """All extracted fields from identity document."""

    # Declares the same field names as IdCardFields, but without per-field
    # descriptions. Every attribute is optional and defaults to None.

    # Document Information
    document_number: Optional[ExtractedField] = None
    document_type: Optional[ExtractedField] = None
    issuing_country: Optional[ExtractedField] = None
    issuing_authority: Optional[ExtractedField] = None

    # Personal Information
    surname: Optional[ExtractedField] = None
    given_names: Optional[ExtractedField] = None
    nationality: Optional[ExtractedField] = None
    date_of_birth: Optional[ExtractedField] = None
    gender: Optional[ExtractedField] = None
    place_of_birth: Optional[ExtractedField] = None

    # Validity Information
    date_of_issue: Optional[ExtractedField] = None
    date_of_expiry: Optional[ExtractedField] = None
    personal_number: Optional[ExtractedField] = None

    # Additional fields for specific document types
    optional_data_1: Optional[ExtractedField] = None
    optional_data_2: Optional[ExtractedField] = None
class MRZData(BaseModel):
    """Machine Readable Zone data."""

    # All values are plain optional strings defaulting to None, except
    # `confidence`, which defaults to 0.0 and is validated to [0.0, 1.0].

    # Primary canonical fields
    document_type: Optional[str] = Field(
        default=None,
        description="MRZ document type (TD1|TD2|TD3)",
    )
    issuing_country: Optional[str] = Field(
        default=None,
        description="Issuing country code",
    )
    surname: Optional[str] = Field(
        default=None,
        description="Surname from MRZ",
    )
    given_names: Optional[str] = Field(
        default=None,
        description="Given names from MRZ",
    )
    document_number: Optional[str] = Field(
        default=None,
        description="Document number from MRZ",
    )
    nationality: Optional[str] = Field(
        default=None,
        description="Nationality code from MRZ",
    )
    date_of_birth: Optional[str] = Field(
        default=None,
        description="Date of birth from MRZ",
    )
    gender: Optional[str] = Field(
        default=None,
        description="Gender from MRZ",
    )
    date_of_expiry: Optional[str] = Field(
        default=None,
        description="Date of expiry from MRZ",
    )
    personal_number: Optional[str] = Field(
        default=None,
        description="Personal number from MRZ",
    )
    raw_mrz: Optional[str] = Field(
        default=None,
        description="Raw MRZ text",
    )
    confidence: float = Field(
        default=0.0,
        description="MRZ extraction confidence",
        ge=0.0,
        le=1.0,
    )

    # Backwards compatibility fields (some older code/tests expect these names).
    # They are meant to carry the same information as `document_type` and
    # `raw_mrz` respectively.
    # NOTE: these are independent fields, not pydantic aliases -- nothing in
    # this class copies values between them and the canonical fields, so
    # whoever builds the model must populate both if both are needed.
    format_type: Optional[str] = Field(
        default=None,
        description="Alias of document_type for backward compatibility",
    )
    raw_text: Optional[str] = Field(
        default=None,
        description="Alias of raw_mrz for backward compatibility",
    )
class OCRDetection(BaseModel):
    """Single OCR detection result."""

    # Optional: defaults to None when no MRZ data is supplied.
    mrz_data: Optional[MRZData] = Field(
        default=None,
        description="MRZ data if detected",
    )

    # Required: must always be provided, even if all of its fields are None.
    extracted_fields: ExtractedFields = Field(
        ...,
        description="Extracted field data",
    )
class OCRResponse(BaseModel):
    """OCR API response."""

    # All four fields are required; none has a default value.

    request_id: str = Field(
        ...,
        description="Unique request identifier",
    )
    media_type: str = Field(
        ...,
        description="Media type processed",
    )
    # Expressed in seconds; no lower bound is enforced by the model.
    processing_time: float = Field(
        ...,
        description="Processing time in seconds",
    )
    # An empty list is accepted; the field only has to be present.
    detections: List[OCRDetection] = Field(
        ...,
        description="List of OCR detections",
    )