tommulder commited on
Commit
211e423
·
1 Parent(s): f4349d6

feat(api): FastAPI app + model loader refactor; add mock mode for tests

- Add pyproject + setuptools config and console entrypoint
- Implement enhanced field extraction + MRZ heuristics
- Add response builder with compatibility for legacy MRZ fields
- New preprocessing pipeline for PDFs/images
- HF Spaces GPU: cache ENV, optional flash-attn, configurable base image
- Add Make targets for Spaces GPU and local CPU
- Add httpx for TestClient; tests pass in mock mode
- Remove embedded model files and legacy app/modules

Browse files
.gitignore CHANGED
@@ -1,12 +1,8 @@
1
- # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
5
-
6
- # C extensions
7
  *.so
8
-
9
- # Distribution / packaging
10
  .Python
11
  build/
12
  develop-eggs/
@@ -20,156 +16,78 @@ parts/
20
  sdist/
21
  var/
22
  wheels/
23
- pip-wheel-metadata/
24
- share/python-wheels/
25
  *.egg-info/
26
  .installed.cfg
27
  *.egg
28
  MANIFEST
29
 
30
- # PyInstaller
31
- # Usually these files are written by a python script from a template
32
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
- *.manifest
34
- *.spec
 
 
 
35
 
36
- # Installer logs
37
- pip-log.txt
38
- pip-delete-this-directory.txt
 
 
 
39
 
40
- # Unit test / coverage reports
 
 
41
  htmlcov/
42
  .tox/
43
  .nox/
44
- .coverage
45
- .coverage.*
46
- .cache
47
- nosetests.xml
48
  coverage.xml
49
  *.cover
50
- *.py,cover
51
  .hypothesis/
52
- .pytest_cache/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- target/
76
 
77
  # Jupyter Notebook
78
  .ipynb_checkpoints
79
 
80
- # IPython
81
- profile_default/
82
- ipython_config.py
83
-
84
  # pyenv
85
  .python-version
86
 
87
- # pipenv
88
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
- # install all needed dependencies.
92
- #Pipfile.lock
93
-
94
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
- __pypackages__/
96
-
97
- # Celery stuff
98
- celerybeat-schedule
99
- celerybeat.pid
100
-
101
- # SageMath parsed files
102
- *.sage.py
103
-
104
  # Environments
105
  .env
106
- .venv
107
- env/
108
- venv/
109
- ENV/
110
- env.bak/
111
- venv.bak/
112
-
113
- # Spyder project settings
114
- .spyderproject
115
- .spyproject
116
-
117
- # Rope project settings
118
- .ropeproject
119
-
120
- # mkdocs documentation
121
- /site
122
 
123
  # mypy
124
  .mypy_cache/
125
  .dmypy.json
126
  dmypy.json
127
 
128
- # Pyre type checker
129
- .pyre/
 
 
 
 
 
 
 
130
 
131
  # Data files
132
- *.csv
133
- *.json
134
- *.jsonl
135
- *.parquet
136
- *.feather
137
- *.arrow
138
  data/
139
- datasets/
140
- raw_data/
141
- processed_data/
142
-
143
- # Hugging Face specific
144
- .cache/
145
- huggingface_hub/
146
- transformers_cache/
147
-
148
- # OpenCV and image processing
149
  *.jpg
150
  *.jpeg
151
  *.png
152
- *.gif
153
- *.bmp
154
- *.tiff
155
- *.tif
156
- *.webp
157
- *.svg
158
- test_images/
159
- sample_images/
160
- uploads/
161
- temp_images/
162
 
163
- # IDE and editor files
164
- .vscode/
165
- .idea/
166
- *.swp
167
- *.swo
168
- *~
169
- .DS_Store
170
- Thumbs.db
171
 
172
- # OS generated files
173
  .DS_Store
174
  .DS_Store?
175
  ._*
@@ -178,66 +96,8 @@ Thumbs.db
178
  ehthumbs.db
179
  Thumbs.db
180
 
181
- # Logs
182
- *.log
183
- logs/
184
- log/
185
-
186
- # Temporary files
187
- tmp/
188
- temp/
189
- .tmp/
190
-
191
  # Docker
192
  .dockerignore
193
 
194
- # Local configuration
195
- config.local.py
196
- settings.local.py
197
- .env.local
198
- .env.development
199
- .env.test
200
- .env.production
201
-
202
- # Backup files
203
- *.bak
204
- *.backup
205
- *.old
206
-
207
- # Runtime files
208
- *.pid
209
- *.sock
210
-
211
- # Coverage reports
212
- htmlcov/
213
- .coverage
214
- coverage.xml
215
-
216
- # Profiling
217
- *.prof
218
-
219
- # Jupyter notebook checkpoints
220
- .ipynb_checkpoints/
221
-
222
- # pytest
223
- .pytest_cache/
224
-
225
- # Ruff
226
- .ruff_cache/
227
-
228
- # Black
229
- .black/
230
-
231
- # isort
232
- .isort.cfg
233
-
234
- # Pre-commit
235
- .pre-commit-config.yaml
236
-
237
- # Local development
238
- local/
239
- dev/
240
- development/
241
-
242
  .cursor/
243
  docs/
 
1
+ # Python
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
 
 
5
  *.so
 
 
6
  .Python
7
  build/
8
  develop-eggs/
 
16
  sdist/
17
  var/
18
  wheels/
 
 
19
  *.egg-info/
20
  .installed.cfg
21
  *.egg
22
  MANIFEST
23
 
24
+ # Virtual environments
25
+ .env
26
+ .venv
27
+ env/
28
+ venv/
29
+ ENV/
30
+ env.bak/
31
+ venv.bak/
32
 
33
+ # IDE
34
+ .vscode/
35
+ .idea/
36
+ *.swp
37
+ *.swo
38
+ *~
39
 
40
+ # Testing
41
+ .pytest_cache/
42
+ .coverage
43
  htmlcov/
44
  .tox/
45
  .nox/
 
 
 
 
46
  coverage.xml
47
  *.cover
 
48
  .hypothesis/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # Jupyter Notebook
51
  .ipynb_checkpoints
52
 
 
 
 
 
53
  # pyenv
54
  .python-version
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # Environments
57
  .env
58
+ .env.local
59
+ .env.development.local
60
+ .env.test.local
61
+ .env.production.local
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # mypy
64
  .mypy_cache/
65
  .dmypy.json
66
  dmypy.json
67
 
68
+ # Ruff
69
+ .ruff_cache/
70
+
71
+ # Model files (if stored locally)
72
+ models/
73
+ *.bin
74
+ *.safetensors
75
+ *.pt
76
+ *.pth
77
 
78
  # Data files
 
 
 
 
 
 
79
  data/
 
 
 
 
 
 
 
 
 
 
80
  *.jpg
81
  *.jpeg
82
  *.png
83
+ *.pdf
84
+ *.mp4
 
 
 
 
 
 
 
 
85
 
86
+ # Logs
87
+ *.log
88
+ logs/
 
 
 
 
 
89
 
90
+ # OS
91
  .DS_Store
92
  .DS_Store?
93
  ._*
 
96
  ehthumbs.db
97
  Thumbs.db
98
 
 
 
 
 
 
 
 
 
 
 
99
  # Docker
100
  .dockerignore
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  .cursor/
103
  docs/
Dockerfile CHANGED
@@ -1,4 +1,5 @@
1
- FROM pytorch/pytorch:2.7.0-cuda12.6-cudnn9-runtime
 
2
 
3
  # Build args to optionally enable flash-attn installation and override wheel URL
4
  # Enable by default for Hugging Face Spaces GPU builds; override locally with
@@ -6,6 +7,13 @@ FROM pytorch/pytorch:2.7.0-cuda12.6-cudnn9-runtime
6
  ARG INSTALL_FLASH_ATTN=true
7
  ARG FLASH_ATTN_WHEEL_URL=https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
8
 
 
 
 
 
 
 
 
9
  # Install system dependencies as root
10
  RUN apt-get update && apt-get install -y \
11
  libgl1-mesa-dri \
@@ -48,6 +56,9 @@ RUN pip install --no-cache-dir --upgrade pip
48
  COPY --chown=user requirements.txt .
49
  RUN pip install --no-cache-dir -r requirements.txt
50
 
 
 
 
51
  # Optionally install flash-attn wheel (requires Python/torch/CUDA compatibility)
52
  # Will auto-skip if the wheel's Python tag does not match this image's Python.
53
  RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
@@ -63,11 +74,15 @@ RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
63
  echo "Skipping flash-attn installation"; \
64
  fi
65
 
66
- # Copy application code
67
- COPY --chown=user . .
 
 
 
 
68
 
69
  # Expose port
70
  EXPOSE 7860
71
 
72
  # Run the application
73
- CMD ["python", "app.py"]
 
1
+ ARG BASE_IMAGE=pytorch/pytorch:2.7.0-cuda12.6-cudnn9-runtime
2
+ FROM ${BASE_IMAGE}
3
 
4
  # Build args to optionally enable flash-attn installation and override wheel URL
5
  # Enable by default for Hugging Face Spaces GPU builds; override locally with
 
7
  ARG INSTALL_FLASH_ATTN=true
8
  ARG FLASH_ATTN_WHEEL_URL=https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
9
 
10
+ # Persist caches and model storage in Spaces, and enable fast transfers
11
+ ENV HF_HUB_ENABLE_HF_TRANSFER=1 \
12
+ HUGGINGFACE_HUB_CACHE=/data/.cache/huggingface \
13
+ HF_HOME=/data/.cache/huggingface \
14
+ DOTS_OCR_LOCAL_DIR=/data/models/dots-ocr \
15
+ PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb=512
16
+
17
  # Install system dependencies as root
18
  RUN apt-get update && apt-get install -y \
19
  libgl1-mesa-dri \
 
56
  COPY --chown=user requirements.txt .
57
  RUN pip install --no-cache-dir -r requirements.txt
58
 
59
+ # Copy pyproject.toml for package installation
60
+ COPY --chown=user pyproject.toml .
61
+
62
  # Optionally install flash-attn wheel (requires Python/torch/CUDA compatibility)
63
  # Will auto-skip if the wheel's Python tag does not match this image's Python.
64
  RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
 
74
  echo "Skipping flash-attn installation"; \
75
  fi
76
 
77
+ # Copy source code
78
+ COPY --chown=user src/ ./src/
79
+ COPY --chown=user main.py .
80
+
81
+ # Install the package in development mode
82
+ RUN pip install --no-cache-dir -e .
83
 
84
  # Expose port
85
  EXPOSE 7860
86
 
87
  # Run the application
88
+ CMD ["python", "main.py"]
Makefile ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: help install dev test lint format clean run
2
+
3
+ help: ## Show this help message
4
+ @echo "KYB Tech Dots.OCR - Development Commands"
5
+ @echo "=========================================="
6
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
7
+
8
+ install: ## Install the package
9
+ uv pip install -e .
10
+
11
+ dev: ## Install development dependencies
12
+ uv pip install -e .[dev]
13
+
14
+ test: ## Run tests
15
+ pytest
16
+
17
+ test-verbose: ## Run tests with verbose output
18
+ pytest -v
19
+
20
+ lint: ## Run linting
21
+ ruff check .
22
+ mypy src/
23
+
24
+ format: ## Format code
25
+ black .
26
+ ruff check --fix .
27
+
28
+ clean: ## Clean up build artifacts
29
+ rm -rf build/
30
+ rm -rf dist/
31
+ rm -rf *.egg-info/
32
+ rm -rf .pytest_cache/
33
+ rm -rf .mypy_cache/
34
+ rm -rf .ruff_cache/
35
+ find . -type d -name __pycache__ -exec rm -rf {} +
36
+ find . -type f -name "*.pyc" -delete
37
+
38
+ run: ## Run the application
39
+ python main.py
40
+
41
+ run-dev: ## Run the application in development mode
42
+ uvicorn src.kybtech_dots_ocr.app:app --host 0.0.0.0 --port 7860 --reload
43
+
44
+ setup: ## Set up development environment
45
+ python setup_dev.py
46
+
47
+ check: ## Run all checks (lint, format, test)
48
+ $(MAKE) lint
49
+ $(MAKE) test
50
+
51
+ build: ## Build the Docker image
52
+ docker build -t kybtech-dots-ocr .
53
+
54
+ # Build for Hugging Face Spaces GPU (CUDA, optional flash-attn)
55
+ build-spaces-gpu: ## Build Docker image for HF Spaces GPU
56
+ # Use CUDA runtime base; leave flash-attn on by default (can disable with ARGS)
57
+ docker build \
58
+ --build-arg BASE_IMAGE=pytorch/pytorch:2.7.0-cuda12.6-cudnn9-runtime \
59
+ --build-arg INSTALL_FLASH_ATTN=true \
60
+ -t kybtech-dots-ocr:spaces-gpu .
61
+
62
+ # Build for local Apple Silicon CPU (no CUDA, no flash-attn)
63
+ build-local-cpu: ## Build Docker image for local CPU (arm64)
64
+ docker build \
65
+ --platform=linux/arm64 \
66
+ --build-arg BASE_IMAGE=python:3.12-slim \
67
+ --build-arg INSTALL_FLASH_ATTN=false \
68
+ -t kybtech-dots-ocr:cpu .
69
+
70
+ run-docker: ## Run the Docker container locally
71
+ docker run -p 7860:7860 kybtech-dots-ocr
72
+
73
+ deploy-staging: ## Deploy to staging (requires HF CLI)
74
+ @echo "Deploying to staging..."
75
+ @echo "Make sure you have HF CLI installed and are logged in"
76
+ @echo "Then push to your staging space repository"
77
+
78
+ deploy-production: ## Deploy to production (requires HF CLI)
79
+ @echo "Deploying to production..."
80
+ @echo "Make sure you have HF CLI installed and are logged in"
81
+ @echo "Then push to your production space repository"
82
+
83
+ test-local: ## Test the local API endpoint
84
+ cd scripts && ./run_tests.sh -e local
85
+
86
+ test-production: ## Test the production API endpoint
87
+ cd scripts && ./run_tests.sh -e production
88
+
89
+ test-staging: ## Test the staging API endpoint
90
+ cd scripts && ./run_tests.sh -e staging
91
+
92
+ test-quick: ## Quick test with curl (no Python dependencies)
93
+ cd scripts && ./test_production_curl.sh
94
+
95
+ logs: ## Show application logs (if running in Docker)
96
+ docker logs -f kybtech-dots-ocr
97
+
98
+ stop: ## Stop the Docker container
99
+ docker stop kybtech-dots-ocr || true
100
+
101
+ clean-docker: ## Clean up Docker images and containers
102
+ docker stop kybtech-dots-ocr || true
103
+ docker rm kybtech-dots-ocr || true
104
+ docker rmi kybtech-dots-ocr || true
README.md CHANGED
@@ -178,24 +178,48 @@ The Space will be available at `https://algoryn-dots-ocr-idcard.hf.space` after
178
 
179
  ## 🐳 Local Development
180
 
181
- ### Run with Docker
182
  ```bash
183
- # Build the image
184
- docker build -t dots-ocr-api .
185
 
186
- # Run the container
187
- docker run -p 7860:7860 dots-ocr-api
 
 
 
 
 
188
  ```
189
 
190
- ### Run with Python
191
  ```bash
192
- # Install dependencies
193
- pip install -r requirements.txt
 
194
 
195
- # Run the application
196
- python app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  ```
198
 
 
 
199
  ## 📚 Documentation
200
 
201
  - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
 
178
 
179
  ## 🐳 Local Development
180
 
181
+ ### Quick Start with uv
182
  ```bash
183
+ # Set up development environment
184
+ make setup
185
 
186
+ # Activate virtual environment
187
+ source .venv/bin/activate # On Unix/macOS
188
+ # or
189
+ .venv\Scripts\activate # On Windows
190
+
191
+ # Run the application
192
+ make run-dev
193
  ```
194
 
195
+ ### Docker Development
196
  ```bash
197
+ # Build and run with Docker
198
+ make build
199
+ make run-docker
200
 
201
+ # View logs
202
+ make logs
203
+ ```
204
+
205
+ ### Development Commands
206
+ ```bash
207
+ # Run tests
208
+ make test
209
+
210
+ # Format code
211
+ make format
212
+
213
+ # Run linting
214
+ make lint
215
+
216
+ # Test API endpoints
217
+ make test-local
218
+ make test-production
219
  ```
220
 
221
+ For detailed development instructions, see the documentation in `docs/`.
222
+
223
  ## 📚 Documentation
224
 
225
  - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
main.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Main entry point for the KYB Tech Dots.OCR application.

Provides a callable ``main()`` for console_scripts and direct execution.
"""

import os

import uvicorn

try:
    # Preferred: the installed package. pyproject.toml maps package-dir to
    # "src", so after `pip install -e .` (as the Dockerfile does) the app is
    # importable as `kybtech_dots_ocr`, NOT `src.kybtech_dots_ocr` — the
    # `kyb-ocr` console script would otherwise break outside the repo root.
    from kybtech_dots_ocr.app import app
except ImportError:
    # Fallback: running `python main.py` from the repository root without
    # installing the package first.
    from src.kybtech_dots_ocr.app import app


def main() -> None:
    """Start the FastAPI server with sensible defaults.

    This function is exposed as the `kyb-ocr` console script via
    pyproject.toml. Set DOTS_OCR_SKIP_MODEL_LOAD=1 to skip the heavy model
    download for local testing.

    Environment overrides:
        HOST: bind address (default "0.0.0.0").
        PORT: listen port (default 7860, the HF Spaces convention).
        LOG_LEVEL: uvicorn log level (default "info").
    """
    # Respect environment overrides for host/port/log level.
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", "7860"))
    log_level = os.getenv("LOG_LEVEL", "info")

    uvicorn.run(app, host=host, port=port, log_level=log_level)


if __name__ == "__main__":
    main()
pyproject.toml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "kybtech-dots-ocr"
3
+ version = "1.0.0"
4
+ description = "Dots.OCR model integration for KYB text extraction"
5
+ authors = [
6
+ {name = "Algoryn", email = "info@algoryn.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.9"
10
+ dependencies = [
11
+ "fastapi>=0.112.1",
12
+ "uvicorn[standard]>=0.30.6",
13
+ "python-multipart>=0.0.9",
14
+ "pydantic>=2.2.0,<3.0.0",
15
+ "opencv-python>=4.9.0.80",
16
+ "numpy>=1.26.0",
17
+ "pillow>=10.3.0",
18
+ "huggingface_hub",
19
+ "PyMuPDF>=1.23.0",
20
+ "torch>=2.0.0",
21
+ "transformers>=4.40.0",
22
+ "accelerate>=0.20.0",
23
+ "qwen-vl-utils",
24
+ "requests>=2.31.0",
25
+ "httpx>=0.27.0",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest>=7.0.0",
31
+ "pytest-asyncio>=0.21.0",
32
+ "black>=23.0.0",
33
+ "ruff>=0.1.0",
34
+ "mypy>=1.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ kyb-ocr = "main:main"
39
+
40
+ [build-system]
41
+ requires = ["setuptools>=68", "wheel"]
42
+ build-backend = "setuptools.build_meta"
43
+
44
+ [tool.setuptools]
45
+ package-dir = {"" = "src"}
46
+
47
+ [tool.setuptools.packages.find]
48
+ where = ["src"]
49
+
50
+ [tool.black]
51
+ line-length = 88
52
+ target-version = ['py39']
53
+
54
+ [tool.ruff]
55
+ line-length = 88
56
+ target-version = "py39"
57
+ select = ["E", "F", "W", "C90", "I", "N", "UP", "YTT", "S", "BLE", "FBT", "B", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "FA", "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SLOT", "SIM", "TID", "TCH", "INT", "ARG", "PTH", "TD", "FIX", "ERA", "PD", "PGH", "PL", "TRY", "FLY", "NPY", "AIR", "PERF", "FURB", "LOG", "RUF"]
58
+ ignore = ["S101", "PLR0913", "PLR0912", "PLR0915"]
59
+
60
+ [tool.ruff.per-file-ignores]
61
+ "__init__.py" = ["F401"]
62
+
63
+ [tool.mypy]
64
+ python_version = "3.9"
65
+ warn_return_any = true
66
+ warn_unused_configs = true
67
+ disallow_untyped_defs = true
68
+ disallow_incomplete_defs = true
69
+ check_untyped_defs = true
70
+ disallow_untyped_decorators = true
71
+ no_implicit_optional = true
72
+ warn_redundant_casts = true
73
+ warn_unused_ignores = true
74
+ warn_no_return = true
75
+ warn_unreachable = true
76
+ strict_equality = true
77
+
78
+ [tool.pytest.ini_options]
79
+ testpaths = ["tests"]
80
+ python_files = ["test_*.py"]
81
+ python_classes = ["Test*"]
82
+ python_functions = ["test_*"]
83
+ addopts = "-v --tb=short"
requirements.txt CHANGED
@@ -6,4 +6,10 @@ opencv-python>=4.9.0.80
6
  numpy>=1.26.0
7
  pillow>=10.3.0
8
  huggingface_hub
9
- PyMuPDF
 
 
 
 
 
 
 
6
  numpy>=1.26.0
7
  pillow>=10.3.0
8
  huggingface_hub
9
+ PyMuPDF>=1.23.0
10
+ torch>=2.0.0
11
+ transformers>=4.40.0
12
+ accelerate>=0.20.0
13
+ qwen-vl-utils
14
+ requests>=2.31.0
15
+ httpx>=0.27.0
scripts/README_TESTING.md ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dots.OCR API Testing
2
+
3
+ This directory contains comprehensive testing scripts for the Dots.OCR API endpoint.
4
+
5
+ ## Test Scripts
6
+
7
+ ### 1. `test_api_endpoint.py` - Comprehensive API Testing
8
+
9
+ The main testing script that provides full API validation capabilities.
10
+
11
+ **Features:**
12
+ - Health check validation
13
+ - Single and multiple image testing
14
+ - ROI (Region of Interest) testing
15
+ - Field extraction validation
16
+ - Response structure validation
17
+ - Performance metrics
18
+ - Detailed error reporting
19
+
20
+ **Usage:**
21
+ ```bash
22
+ # Basic test with default settings
23
+ python test_api_endpoint.py
24
+
25
+ # Test with custom API URL
26
+ python test_api_endpoint.py --url https://your-api.example.com
27
+
28
+ # Test with ROI
29
+ python test_api_endpoint.py --roi '{"x1": 0.1, "y1": 0.1, "x2": 0.9, "y2": 0.9}'
30
+
31
+ # Test with specific expected fields
32
+ python test_api_endpoint.py --expected-fields document_number surname given_names
33
+
34
+ # Verbose output
35
+ python test_api_endpoint.py --verbose
36
+
37
+ # Custom timeout
38
+ python test_api_endpoint.py --timeout 60
39
+ ```
40
+
41
+ **Options:**
42
+ - `--url`: API base URL (default: http://localhost:7860)
43
+ - `--timeout`: Request timeout in seconds (default: 30)
44
+ - `--roi`: ROI coordinates as JSON string
45
+ - `--expected-fields`: List of expected field names to validate
46
+ - `--verbose`: Enable verbose logging
47
+
48
+ ### 2. `quick_test.py` - Quick Validation
49
+
50
+ A simple script for quick API validation after deployment.
51
+
52
+ **Usage:**
53
+ ```bash
54
+ # Test local API
55
+ python quick_test.py
56
+
57
+ # Test remote API
58
+ python quick_test.py https://your-api.example.com
59
+ ```
60
+
61
+ ## Test Configuration
62
+
63
+ ### `test_config.json`
64
+
65
+ Configuration file for test parameters and thresholds.
66
+
67
+ **Configuration sections:**
68
+ - `api_endpoints`: Different API URLs for various environments
69
+ - `test_images`: List of test image files
70
+ - `expected_fields`: Fields that should be extracted
71
+ - `roi_test_cases`: Different ROI configurations to test
72
+ - `performance_thresholds`: Performance validation criteria
73
+ - `test_timeout`: Default timeout for requests
74
+
75
+ ## Test Images
76
+
77
+ The following test images are used for validation:
78
+
79
+ - `tom_id_card_front.jpg` - Front of Dutch ID card
80
+ - `tom_id_card_back.jpg` - Back of Dutch ID card
81
+
82
+ ## Testing Scenarios
83
+
84
+ ### 1. Basic Functionality Test
85
+ ```bash
86
+ python test_api_endpoint.py
87
+ ```
88
+ Tests basic API functionality with default settings.
89
+
90
+ ### 2. ROI Testing
91
+ ```bash
92
+ python test_api_endpoint.py --roi '{"x1": 0.25, "y1": 0.25, "x2": 0.75, "y2": 0.75}'
93
+ ```
94
+ Tests Region of Interest cropping functionality.
95
+
96
+ ### 3. Field Validation Test
97
+ ```bash
98
+ python test_api_endpoint.py --expected-fields document_number surname given_names nationality
99
+ ```
100
+ Tests that specific fields are extracted correctly.
101
+
102
+ ### 4. Performance Test
103
+ ```bash
104
+ python test_api_endpoint.py --timeout 60 --verbose
105
+ ```
106
+ Tests API performance with extended timeout and detailed logging.
107
+
108
+ ## Expected Results
109
+
110
+ ### Successful Test Output
111
+ ```
112
+ 🔍 Checking API health...
113
+ ✅ API is healthy: {'status': 'healthy', 'version': '1.0.0', 'model_loaded': True}
114
+ 🚀 Starting API tests with 2 images...
115
+ ✅ tom_id_card_front.jpg: 2.45s
116
+ ✅ tom_id_card_back.jpg: 1.23s
117
+ 📊 Test Results:
118
+ Total images: 2
119
+ Successful: 2
120
+ Failed: 0
121
+ Success rate: 100.0%
122
+ Average processing time: 1.84s
123
+ 🎉 All tests completed successfully!
124
+ ```
125
+
126
+ ### Field Extraction Example
127
+ ```
128
+ Page 1: 11 fields extracted
129
+ document_number: NLD123456789 (confidence: 0.90)
130
+ surname: MULDER (confidence: 0.90)
131
+ given_names: THOMAS JAN (confidence: 0.90)
132
+ nationality: NLD (confidence: 0.95)
133
+ date_of_birth: 15-03-1990 (confidence: 0.90)
134
+ gender: M (confidence: 0.95)
135
+ ```
136
+
137
+ ## Troubleshooting
138
+
139
+ ### Common Issues
140
+
141
+ 1. **Connection Refused**
142
+ - Check if the API is running
143
+ - Verify the correct URL and port
144
+ - Check firewall settings
145
+
146
+ 2. **Timeout Errors**
147
+ - Increase timeout with `--timeout` parameter
148
+ - Check API performance and resource usage
149
+
150
+ 3. **Missing Fields**
151
+ - Verify test images contain the expected text
152
+ - Check field extraction patterns in the code
153
+ - Review API logs for processing errors
154
+
155
+ 4. **Validation Errors**
156
+ - Check API response format
157
+ - Verify model is loaded correctly
158
+ - Review error logs for details
159
+
160
+ ### Debug Mode
161
+
162
+ Enable verbose logging for detailed debugging:
163
+ ```bash
164
+ python test_api_endpoint.py --verbose
165
+ ```
166
+
167
+ ## Integration with CI/CD
168
+
169
+ The test scripts can be integrated into CI/CD pipelines:
170
+
171
+ ```yaml
172
+ # Example GitHub Actions step
173
+ - name: Test API Endpoint
174
+ run: |
175
+ python scripts/test_api_endpoint.py --url ${{ env.API_URL }} --timeout 60
176
+ ```
177
+
178
+ ## Performance Monitoring
179
+
180
+ The scripts provide performance metrics that can be used for monitoring:
181
+
182
+ - Processing time per image
183
+ - Success rate
184
+ - Field extraction accuracy
185
+ - Response validation results
186
+
187
+ These metrics can be integrated with monitoring systems like Prometheus or DataDog.
188
+
189
+ ## 🚀 Production API Testing
190
+
191
+ ### Current Production Endpoint
192
+ - **URL**: https://algoryn-dots-ocr-idcard.hf.space
193
+ - **Health Check**: https://algoryn-dots-ocr-idcard.hf.space/health
194
+ - **API Docs**: https://algoryn-dots-ocr-idcard.hf.space/docs
195
+
196
+ ### Quick Production Test
197
+ ```bash
198
+ # Test production API
199
+ ./run_tests.sh -e production
200
+
201
+ # Quick test with curl (no Python dependencies)
202
+ ./test_production_curl.sh
203
+ ```
204
+
205
+ ### Staging Environment
206
+ - **Staging URL**: https://algoryn-dots-ocr-idcard-staging.hf.space (to be created)
207
+ - **Purpose**: Safe testing before production deployment
208
+
209
+ ### Environment-Specific Testing
210
+ ```bash
211
+ # Test different environments
212
+ ./run_tests.sh -e local # Local development
213
+ ./run_tests.sh -e staging # Staging environment
214
+ ./run_tests.sh -e production # Production environment
215
+ ```
scripts/quick_test.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Quick API Test Script

A simple script to quickly test the deployed Dots.OCR API endpoint.
"""

import sys
from pathlib import Path

import requests


def test_api(base_url="http://localhost:7860"):
    """Quick smoke test of the API endpoint.

    Hits ``/health``, then posts a sample ID-card image to ``/v1/id/ocr``
    and prints a summary of the extracted fields.

    Args:
        base_url: Base URL of the deployed API.

    Returns:
        True when both the health check and the OCR request succeed,
        False otherwise (details are printed, not raised).
    """
    print(f"🔍 Testing API at {base_url}")

    # Health check
    try:
        health_response = requests.get(f"{base_url}/health", timeout=10)
        health_response.raise_for_status()
        health_data = health_response.json()
        print(f"✅ Health check passed: {health_data}")
    except Exception as e:
        print(f"❌ Health check failed: {e}")
        return False

    # Test with front image (expected to sit next to this script)
    front_image = Path(__file__).parent / "tom_id_card_front.jpg"
    if not front_image.exists():
        print(f"❌ Test image not found: {front_image}")
        return False

    print(f"📸 Testing with {front_image.name}")

    try:
        with open(front_image, 'rb') as f:
            files = {'file': f}
            response = requests.post(
                f"{base_url}/v1/id/ocr",
                files=files,
                timeout=30
            )
        response.raise_for_status()
        result = response.json()

        print("✅ OCR test passed")
        print(f"   Request ID: {result.get('request_id')}")
        print(f"   Media type: {result.get('media_type')}")
        # processing_time may be absent or null in the payload; don't let
        # the :.2f format spec raise a TypeError on None.
        processing_time = result.get('processing_time')
        if isinstance(processing_time, (int, float)):
            print(f"   Processing time: {processing_time:.2f}s")
        else:
            print(f"   Processing time: {processing_time}")
        print(f"   Detections: {len(result.get('detections', []))}")

        # Show extracted fields per detected page
        for i, detection in enumerate(result.get('detections', [])):
            fields = detection.get('extracted_fields', {})
            field_count = len([v for v in fields.values() if v is not None])
            print(f"   Page {i+1}: {field_count} fields extracted")

            # Show some key fields; each field is either a {value, confidence}
            # dict or a bare scalar — handle both shapes.
            key_fields = ['document_number', 'surname', 'given_names', 'nationality']
            for field in key_fields:
                entry = fields.get(field)
                if entry is None:
                    continue
                if isinstance(entry, dict):
                    value = entry.get('value', 'N/A')
                    confidence = entry.get('confidence', 'N/A')
                else:
                    value = str(entry)
                    confidence = 'N/A'
                print(f"     {field}: {value} (confidence: {confidence})")

        return True

    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        return False


if __name__ == "__main__":
    base_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:7860"
    success = test_api(base_url)
    sys.exit(0 if success else 1)
scripts/run_tests.sh ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Dots.OCR API Test Runner
#
# Wraps quick_test.py / test_api_endpoint.py with environment-aware
# defaults (local, staging, production).

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default values
API_URL="http://localhost:7860"
TIMEOUT=30
VERBOSE=false
# Initialize QUICK explicitly; it was previously only set by -q, so the
# later `[ "$QUICK" = true ]` test compared against an unset variable.
QUICK=false
ROI=""
ENVIRONMENT="local"

# Function to print colored output
print_status() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Function to show usage
show_usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  -u, --url URL           API base URL (default: http://localhost:7860)"
    echo "  -e, --env ENV           Environment: local, staging, production (default: local)"
    echo "  -t, --timeout SECONDS   Request timeout (default: 30)"
    echo "  -r, --roi JSON          ROI coordinates as JSON string"
    echo "  -v, --verbose           Enable verbose output"
    echo "  -q, --quick             Run quick test only"
    echo "  -h, --help              Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0                                    # Basic test (local)"
    echo "  $0 -e production                      # Test production API"
    echo "  $0 -e staging                         # Test staging API"
    echo "  $0 -u https://api.example.com         # Test custom API URL"
    echo "  $0 -r '{\"x1\":0.1,\"y1\":0.1,\"x2\":0.9,\"y2\":0.9}'  # Test with ROI"
    echo "  $0 -v -t 60                           # Verbose with 60s timeout"
    echo "  $0 -q                                 # Quick test only"
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        -u|--url)
            API_URL="$2"
            shift 2
            ;;
        -e|--env)
            ENVIRONMENT="$2"
            shift 2
            ;;
        -t|--timeout)
            TIMEOUT="$2"
            shift 2
            ;;
        -r|--roi)
            ROI="$2"
            shift 2
            ;;
        -v|--verbose)
            VERBOSE=true
            shift
            ;;
        -q|--quick)
            QUICK=true
            shift
            ;;
        -h|--help)
            show_usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $1"
            show_usage
            exit 1
            ;;
    esac
done

# Set API URL based on environment if not explicitly provided
if [ "$API_URL" = "http://localhost:7860" ] && [ "$ENVIRONMENT" != "local" ]; then
    case $ENVIRONMENT in
        "staging")
            API_URL="https://algoryn-dots-ocr-idcard-staging.hf.space"
            ;;
        "production")
            API_URL="https://algoryn-dots-ocr-idcard.hf.space"
            ;;
        *)
            print_error "Unknown environment: $ENVIRONMENT. Use: local, staging, production"
            exit 1
            ;;
    esac
fi

# Check if Python is available
if ! command -v python3 &> /dev/null; then
    print_error "Python 3 is required but not installed"
    exit 1
fi

# Check if test images exist
if [ ! -f "tom_id_card_front.jpg" ] || [ ! -f "tom_id_card_back.jpg" ]; then
    print_error "Test images not found. Please ensure tom_id_card_front.jpg and tom_id_card_back.jpg are in the scripts directory"
    exit 1
fi

print_status "Starting Dots.OCR API Tests"
print_status "Environment: $ENVIRONMENT"
print_status "API URL: $API_URL"
print_status "Timeout: $TIMEOUT seconds"

# Run quick test if requested
if [ "$QUICK" = true ]; then
    print_status "Running quick test..."
    if python3 quick_test.py "$API_URL"; then
        print_success "Quick test passed"
        exit 0
    else
        print_error "Quick test failed"
        exit 1
    fi
fi

# Build test arguments as an array instead of a string fed to `eval`, so
# the ROI JSON (which contains quotes, braces and possibly spaces) is
# passed through verbatim and cannot be re-split or re-interpreted by
# the shell.
TEST_ARGS=(--url "$API_URL" --timeout "$TIMEOUT")

if [ "$VERBOSE" = true ]; then
    TEST_ARGS+=(--verbose)
fi

if [ -n "$ROI" ]; then
    TEST_ARGS+=(--roi "$ROI")
fi

# Run comprehensive test
print_status "Running comprehensive API test..."
if python3 test_api_endpoint.py "${TEST_ARGS[@]}"; then
    print_success "All tests passed successfully!"
    exit 0
else
    print_error "Tests failed"
    exit 1
fi
scripts/test_api_endpoint.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """API Endpoint Test Script for Dots.OCR
3
+
4
+ This script tests the deployed Dots.OCR API endpoint using real ID card images.
5
+ It can be used to validate the complete pipeline in a production environment.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import time
12
+ import requests
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Dict, Any, Optional, List
16
+ import argparse
17
+
18
+ # Configure logging
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format='%(asctime)s - %(levelname)s - %(message)s'
22
+ )
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class DotsOCRAPITester:
    """Test client for the Dots.OCR API endpoint.

    Wraps a ``requests.Session`` and provides helpers to hit the health
    and OCR endpoints plus validators for the response schema.
    """

    def __init__(self, base_url: str, timeout: int = 30):
        """Initialize the API tester.

        Args:
            base_url: Base URL of the deployed API (e.g., "http://localhost:7860")
            timeout: Request timeout in seconds
        """
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout
        self.session = requests.Session()

        # Set common headers
        self.session.headers.update({
            'User-Agent': 'DotsOCR-APITester/1.0'
        })

    def health_check(self) -> Dict[str, Any]:
        """Check API health status.

        Returns:
            Parsed JSON health response, or ``{"error": ...}`` on failure.
        """
        try:
            response = self.session.get(
                f"{self.base_url}/health",
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Health check failed: {e}")
            return {"error": str(e)}

    def test_ocr_endpoint(
        self,
        image_path: str,
        roi: Optional[Dict[str, float]] = None,
        expected_fields: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Test the OCR endpoint with an image file.

        Args:
            image_path: Path to the image file
            roi: Optional ROI coordinates as {x1, y1, x2, y2}
            expected_fields: List of expected field names to validate

        Returns:
            Test results dictionary with ``success``, timing, the raw
            response and schema/field validation results.
        """
        logger.info(f"Testing OCR endpoint with {image_path}")

        data = {}
        if roi:
            data['roi'] = json.dumps(roi)
            logger.info(f"Using ROI: {roi}")

        try:
            # BUGFIX: manage the file handle with a context manager.
            # Previously the file was opened before the try block and the
            # finally clause checked `'file' in locals()` -- that name was
            # never bound (the variable was `files`), so the handle leaked
            # whenever the request raised; the success path also closed it
            # twice.
            with open(image_path, 'rb') as image_file:
                start_time = time.time()
                response = self.session.post(
                    f"{self.base_url}/v1/id/ocr",
                    files={'file': image_file},
                    data=data,
                    timeout=self.timeout
                )
                request_time = time.time() - start_time

            # Check response
            response.raise_for_status()
            result = response.json()

            # Validate response structure
            validation_result = self._validate_response(result)

            # Check expected fields
            field_validation = self._validate_expected_fields(result, expected_fields)

            return {
                "success": True,
                "request_time": request_time,
                "response": result,
                "validation": validation_result,
                "field_validation": field_validation,
                "status_code": response.status_code
            }

        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed: {e}")
            return {
                "success": False,
                "error": str(e),
                # e.response may be None (e.g. connection errors)
                "status_code": getattr(e.response, 'status_code', None)
            }
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    def _validate_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the API response structure.

        Args:
            response: API response dictionary

        Returns:
            Validation results with ``valid``, ``errors`` and ``warnings``.
        """
        validation = {
            "valid": True,
            "errors": [],
            "warnings": []
        }

        # Required top-level fields
        required_fields = ['request_id', 'media_type', 'processing_time', 'detections']
        for field in required_fields:
            if field not in response:
                validation["errors"].append(f"Missing required field: {field}")
                validation["valid"] = False

        # Validate detections: must be a list of dicts; missing per-detection
        # payloads are only warnings (the endpoint may legitimately omit them)
        if 'detections' in response:
            if not isinstance(response['detections'], list):
                validation["errors"].append("detections must be a list")
                validation["valid"] = False
            else:
                for i, detection in enumerate(response['detections']):
                    if not isinstance(detection, dict):
                        validation["errors"].append(f"detection {i} must be a dictionary")
                        validation["valid"] = False
                    else:
                        if 'extracted_fields' not in detection:
                            validation["warnings"].append(f"detection {i} missing extracted_fields")
                        if 'mrz_data' not in detection:
                            validation["warnings"].append(f"detection {i} missing mrz_data")

        # Validate processing time
        if 'processing_time' in response:
            if not isinstance(response['processing_time'], (int, float)):
                validation["errors"].append("processing_time must be a number")
                validation["valid"] = False
            elif response['processing_time'] < 0:
                validation["warnings"].append("processing_time is negative")

        return validation

    def _validate_expected_fields(
        self,
        response: Dict[str, Any],
        expected_fields: Optional[List[str]]
    ) -> Dict[str, Any]:
        """Validate that expected fields are present in the response.

        Args:
            response: API response dictionary
            expected_fields: List of expected field names

        Returns:
            Field validation results; trivially valid when no fields
            are expected.
        """
        if not expected_fields:
            return {"valid": True, "found_fields": [], "missing_fields": []}

        found_fields = []
        missing_fields = []

        # Check all detections for fields
        for i, detection in enumerate(response.get('detections', [])):
            extracted_fields = detection.get('extracted_fields', {})

            for field_name in expected_fields:
                if field_name in extracted_fields and extracted_fields[field_name] is not None:
                    found_fields.append(f"{field_name} (detection {i})")
                else:
                    missing_fields.append(f"{field_name} (detection {i})")

        return {
            "valid": len(missing_fields) == 0,
            "found_fields": found_fields,
            "missing_fields": missing_fields
        }

    def test_multiple_images(
        self,
        image_paths: List[str],
        roi: Optional[Dict[str, float]] = None
    ) -> Dict[str, Any]:
        """Test multiple images and return aggregated results.

        Args:
            image_paths: List of image file paths
            roi: Optional ROI coordinates

        Returns:
            Aggregated test results (counts, success rate, average time,
            per-image results).
        """
        logger.info(f"Testing {len(image_paths)} images")

        results = []
        successful_tests = 0
        total_processing_time = 0

        for image_path in image_paths:
            if not os.path.exists(image_path):
                logger.warning(f"Image not found: {image_path}")
                results.append({
                    "image": image_path,
                    "success": False,
                    "error": "File not found"
                })
                continue

            result = self.test_ocr_endpoint(image_path, roi)
            results.append({
                "image": image_path,
                **result
            })

            if result.get("success", False):
                successful_tests += 1
                total_processing_time += result.get("request_time", 0)

        return {
            "total_images": len(image_paths),
            "successful_tests": successful_tests,
            "failed_tests": len(image_paths) - successful_tests,
            "success_rate": successful_tests / len(image_paths) if image_paths else 0,
            "average_processing_time": total_processing_time / successful_tests if successful_tests > 0 else 0,
            "results": results
        }
272
+
273
+
274
def main():
    """Main test function."""
    arg_parser = argparse.ArgumentParser(description="Test Dots.OCR API endpoint")
    arg_parser.add_argument(
        "--url",
        default="http://localhost:7860",
        help="API base URL (default: http://localhost:7860)"
    )
    arg_parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30)"
    )
    arg_parser.add_argument(
        "--roi",
        type=str,
        help="ROI coordinates as JSON string (e.g., '{\"x1\": 0.1, \"y1\": 0.1, \"x2\": 0.9, \"y2\": 0.9}')"
    )
    arg_parser.add_argument(
        "--expected-fields",
        nargs="+",
        help="Expected field names to validate (e.g., document_number surname given_names)"
    )
    arg_parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )
    opts = arg_parser.parse_args()

    if opts.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Decode the optional ROI payload up front so bad input fails fast.
    roi = None
    if opts.roi:
        try:
            roi = json.loads(opts.roi)
        except json.JSONDecodeError as e:
            logger.error(f"Invalid ROI JSON: {e}")
            sys.exit(1)

    tester = DotsOCRAPITester(opts.url, opts.timeout)

    # Abort early when the service is not reachable/healthy.
    logger.info("🔍 Checking API health...")
    health = tester.health_check()
    if "error" in health:
        logger.error(f"❌ API health check failed: {health['error']}")
        sys.exit(1)
    logger.info(f"✅ API is healthy: {health}")

    # Collect whichever bundled test images actually exist on disk.
    script_dir = Path(__file__).parent
    existing_images = []
    for candidate in ("tom_id_card_front.jpg", "tom_id_card_back.jpg"):
        candidate_path = script_dir / candidate
        if candidate_path.exists():
            existing_images.append(str(candidate_path))
        else:
            logger.warning(f"Test image not found: {candidate_path}")

    if not existing_images:
        logger.error("❌ No test images found")
        sys.exit(1)

    # Fall back to a default field set when none is given on the CLI.
    expected_fields = opts.expected_fields or [
        "document_number",
        "surname",
        "given_names",
        "nationality",
        "date_of_birth",
        "gender"
    ]

    logger.info(f"🚀 Starting API tests with {len(existing_images)} images...")

    if len(existing_images) == 1:
        # Single image test
        outcome = tester.test_ocr_endpoint(existing_images[0], roi, expected_fields)

        if not outcome["success"]:
            logger.error(f"❌ Single image test failed: {outcome.get('error', 'Unknown error')}")
            sys.exit(1)

        logger.info("✅ Single image test passed")
        logger.info(f"⏱️ Processing time: {outcome['request_time']:.2f}s")
        logger.info(f"📄 Detections: {len(outcome['response']['detections'])}")

        # Report which of the expected fields were (not) extracted.
        field_validation = outcome.get("field_validation", {})
        if field_validation.get("found_fields"):
            logger.info(f"✅ Found fields: {', '.join(field_validation['found_fields'])}")
        if field_validation.get("missing_fields"):
            logger.warning(f"⚠️ Missing fields: {', '.join(field_validation['missing_fields'])}")

    else:
        # Multiple images test
        summary = tester.test_multiple_images(existing_images, roi)

        logger.info("📊 Test Results:")
        logger.info(f" Total images: {summary['total_images']}")
        logger.info(f" Successful: {summary['successful_tests']}")
        logger.info(f" Failed: {summary['failed_tests']}")
        logger.info(f" Success rate: {summary['success_rate']:.1%}")
        logger.info(f" Average processing time: {summary['average_processing_time']:.2f}s")

        # Per-image breakdown
        for entry in summary["results"]:
            image_name = Path(entry["image"]).name
            if entry["success"]:
                logger.info(f" ✅ {image_name}: {entry['request_time']:.2f}s")
            else:
                logger.error(f" ❌ {image_name}: {entry.get('error', 'Unknown error')}")

        if summary["failed_tests"] > 0:
            sys.exit(1)

    logger.info("🎉 All tests completed successfully!")


if __name__ == "__main__":
    main()
scripts/test_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "api_endpoints": {
3
+ "local": "http://localhost:7860",
4
+ "staging": "https://algoryn-dots-ocr-idcard-staging.hf.space",
5
+ "production": "https://algoryn-dots-ocr-idcard.hf.space"
6
+ },
7
+ "test_images": [
8
+ "tom_id_card_front.jpg",
9
+ "tom_id_card_back.jpg"
10
+ ],
11
+ "expected_fields": [
12
+ "document_number",
13
+ "surname",
14
+ "given_names",
15
+ "nationality",
16
+ "date_of_birth",
17
+ "gender",
18
+ "date_of_issue",
19
+ "date_of_expiry"
20
+ ],
21
+ "roi_test_cases": [
22
+ {
23
+ "name": "full_image",
24
+ "roi": null,
25
+ "description": "Process entire image"
26
+ },
27
+ {
28
+ "name": "center_crop",
29
+ "roi": {
30
+ "x1": 0.25,
31
+ "y1": 0.25,
32
+ "x2": 0.75,
33
+ "y2": 0.75
34
+ },
35
+ "description": "Process center 50% of image"
36
+ },
37
+ {
38
+ "name": "top_half",
39
+ "roi": {
40
+ "x1": 0.0,
41
+ "y1": 0.0,
42
+ "x2": 1.0,
43
+ "y2": 0.5
44
+ },
45
+ "description": "Process top half of image"
46
+ }
47
+ ],
48
+ "performance_thresholds": {
49
+ "max_processing_time": 10.0,
50
+ "min_confidence": 0.7,
51
+ "min_fields_extracted": 3
52
+ },
53
+ "test_timeout": 30
54
+ }
scripts/test_production.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Production API Test Script
3
+
4
+ Quick test script specifically for the production Dots.OCR API.
5
+ """
6
+
7
+ import requests
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+
12
def test_production_api():
    """Test the production API endpoint.

    Runs a health check against the production deployment, then posts a
    sample ID-card image to the OCR endpoint and prints a summary of the
    response.

    Returns:
        True when both the health check and the OCR request succeed,
        False otherwise.
    """
    api_url = "https://algoryn-dots-ocr-idcard.hf.space"
    print(f"🔍 Testing Production API at {api_url}")

    # Health check
    try:
        print("📡 Checking API health...")
        health_response = requests.get(f"{api_url}/health", timeout=10)
        health_response.raise_for_status()
        health_data = health_response.json()
        print(f"✅ Health check passed: {health_data}")
    except Exception as e:
        print(f"❌ Health check failed: {e}")
        return False

    # Test with front image
    front_image = Path(__file__).parent / "tom_id_card_front.jpg"
    if not front_image.exists():
        print(f"❌ Test image not found: {front_image}")
        return False

    print(f"📸 Testing OCR with {front_image.name}")

    try:
        with open(front_image, 'rb') as f:
            files = {'file': f}
            response = requests.post(
                f"{api_url}/v1/id/ocr",
                files=files,
                timeout=60  # Longer timeout for production
            )
        response.raise_for_status()
        result = response.json()

        print("✅ OCR test passed")
        print(f"  Request ID: {result.get('request_id')}")
        print(f"  Media type: {result.get('media_type')}")
        # BUGFIX: formatting None with ':.2f' raises TypeError when the
        # 'processing_time' key is missing or null -- guard the format.
        processing_time = result.get('processing_time')
        if isinstance(processing_time, (int, float)):
            print(f"  Processing time: {processing_time:.2f}s")
        else:
            print(f"  Processing time: {processing_time}")
        print(f"  Detections: {len(result.get('detections', []))}")

        # Show extracted fields
        for i, detection in enumerate(result.get('detections', [])):
            fields = detection.get('extracted_fields', {})
            # NOTE: loop variable renamed from `f` to `v` -- the original
            # comprehension shadowed the file handle name.
            field_count = len([v for v in fields.values() if v is not None])
            print(f"  Page {i+1}: {field_count} fields extracted")

            # Show some key fields
            key_fields = ['document_number', 'surname', 'given_names', 'nationality']
            for field in key_fields:
                if field in fields and fields[field] is not None:
                    value = fields[field].get('value', 'N/A') if isinstance(fields[field], dict) else str(fields[field])
                    confidence = fields[field].get('confidence', 'N/A') if isinstance(fields[field], dict) else 'N/A'
                    print(f"    {field}: {value} (confidence: {confidence})")

        return True

    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        # requests exceptions may carry the server response for diagnostics
        if hasattr(e, 'response') and e.response is not None:
            print(f"  Status code: {e.response.status_code}")
            print(f"  Response: {e.response.text}")
        return False


if __name__ == "__main__":
    success = test_production_api()
    sys.exit(0 if success else 1)
scripts/test_production_curl.sh ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Production API Test using curl
#
# Health-checks the production deployment, then posts the bundled sample
# ID-card image to the OCR endpoint and prints a summary parsed with jq.

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Production API URL
API_URL="https://algoryn-dots-ocr-idcard.hf.space"

# Function to print colored output
print_status() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

# jq is required to parse the JSON response below; fail early with a
# clear message instead of a confusing mid-run "command not found".
if ! command -v jq &> /dev/null; then
    print_error "jq is required but not installed"
    exit 1
fi

# Check if test image exists
if [ ! -f "tom_id_card_front.jpg" ]; then
    print_error "Test image not found: tom_id_card_front.jpg"
    exit 1
fi

print_status "Testing Production API at $API_URL"

# Health check
print_status "Checking API health..."
if curl -s -f "$API_URL/health" > /dev/null; then
    print_success "Health check passed"
else
    print_error "Health check failed"
    exit 1
fi

# Test OCR endpoint
print_status "Testing OCR endpoint with tom_id_card_front.jpg"

# Make the API request; -w appends the HTTP status code on its own line
response=$(curl -s -w "\n%{http_code}" -X POST \
    -F "file=@tom_id_card_front.jpg" \
    "$API_URL/v1/id/ocr")

# Split response body and status code.
# BUGFIX: `head -n -1` is a GNU extension and fails on BSD/macOS head;
# `sed '$d'` (delete last line) is portable.
http_code=$(echo "$response" | tail -n1)
response_body=$(echo "$response" | sed '$d')

if [ "$http_code" -eq 200 ]; then
    print_success "OCR request successful"

    # Summarize the response. Direct command substitution replaces the
    # previous `| while read` loops, which ran in throwaway subshells for
    # single-line values with no benefit.
    echo "Request ID: $(echo "$response_body" | jq -r '.request_id')"
    echo "Processing time: $(echo "$response_body" | jq -r '.processing_time')s"
    echo "Detections: $(echo "$response_body" | jq -r '.detections | length')"

    # Show extracted fields (non-null only)
    echo "$response_body" | jq -r '.detections[0].extracted_fields | to_entries[] | select(.value != null) | "\(.key): \(.value.value) (confidence: \(.value.confidence))"' | while read field_info; do
        echo " $field_info"
    done

    print_success "Production API test completed successfully!"

else
    print_error "OCR request failed with status code: $http_code"
    echo "Response: $response_body"
    exit 1
fi
setup_dev.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Development setup script for KYB Tech Dots.OCR."""
3
+
4
+ import subprocess
5
+ import sys
6
+ from pathlib import Path
7
+
8
def run_command(cmd, description):
    """Run a shell command and report success or failure.

    Args:
        cmd: Command line to execute. It runs through the shell, so only
            trusted, developer-controlled strings should be passed here
            (all call sites in this script are hard-coded).
        description: Human-readable step name used in the status output.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    print(f"🔄 {description}...")
    try:
        # Output is captured so failed steps can show stderr below;
        # the unused `result =` binding was removed.
        subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
        print(f"✅ {description} completed")
        return True
    except subprocess.CalledProcessError as e:
        print(f"❌ {description} failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
19
+
20
def main():
    """Set up development environment."""
    print("🚀 Setting up KYB Tech Dots.OCR development environment...")

    # Ensure uv is available; attempt a one-shot install when it is not.
    if not run_command("uv --version", "Checking uv installation"):
        print("📦 Installing uv...")
        if not run_command("curl -LsSf https://astral.sh/uv/install.sh | sh", "Installing uv"):
            print("❌ Failed to install uv. Please install it manually from https://github.com/astral-sh/uv")
            sys.exit(1)

    # Setup steps run in order; the first failure aborts the script.
    setup_steps = (
        ("uv venv", "Creating virtual environment"),
        ("uv pip install -e .", "Installing package in development mode"),
        ("uv pip install -e .[dev]", "Installing development dependencies"),
    )
    for command, description in setup_steps:
        if not run_command(command, description):
            sys.exit(1)

    # Post-setup instructions for the developer.
    print("\n🎉 Development environment setup complete!")
    print("\n📋 Next steps:")
    print("1. Activate the virtual environment:")
    print(" source .venv/bin/activate # On Unix/macOS")
    print(" .venv\\Scripts\\activate # On Windows")
    print("\n2. Run the application:")
    print(" python main.py")
    print("\n3. Run tests:")
    print(" pytest")
    print("\n4. Run linting:")
    print(" ruff check .")
    print(" black .")


if __name__ == "__main__":
    main()
src/kybtech_dots_ocr/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """KYB Tech Dots.OCR Package
2
+
3
+ A FastAPI application for identity document text extraction using Dots.OCR model.
4
+ """
5
+
6
+ __version__ = "1.0.0"
7
+ __author__ = "Algoryn"
8
+ __email__ = "info@algoryn.com"
9
+
10
+ from .app import app
11
+ from .api_models import OCRResponse, OCRDetection, ExtractedFields, MRZData, ExtractedField
12
+ from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
13
+ from .preprocessing import process_document, validate_file_size, get_document_info
14
+ from .response_builder import build_ocr_response, build_error_response
15
+
16
+ __all__ = [
17
+ "app",
18
+ "OCRResponse",
19
+ "OCRDetection",
20
+ "ExtractedFields",
21
+ "MRZData",
22
+ "ExtractedField",
23
+ "load_model",
24
+ "extract_text",
25
+ "is_model_loaded",
26
+ "get_model_info",
27
+ "process_document",
28
+ "validate_file_size",
29
+ "get_document_info",
30
+ "build_ocr_response",
31
+ "build_error_response",
32
+ ]
app.py → src/kybtech_dots_ocr/api_models.py RENAMED
@@ -1,44 +1,11 @@
1
- """HF Dots.OCR Text Extraction Endpoint
2
 
3
- This FastAPI application provides a Hugging Face Space endpoint for Dots.OCR
4
- text extraction with ROI support and standardized field extraction schema.
5
  """
6
 
7
- import logging
8
- import time
9
- import uuid
10
- import json
11
- import re
12
  from typing import List, Optional, Dict, Any
13
- from contextlib import asynccontextmanager
14
-
15
- import cv2
16
- import numpy as np
17
- from fastapi import FastAPI, File, Form, HTTPException, UploadFile
18
- from fastapi.responses import JSONResponse
19
  from pydantic import BaseModel, Field
20
- import torch
21
- from PIL import Image
22
- import io
23
- import base64
24
-
25
- # Dots.OCR imports
26
- try:
27
- from dots_ocr import DotsOCR
28
- DOTS_OCR_AVAILABLE = True
29
- except ImportError:
30
- DOTS_OCR_AVAILABLE = False
31
- logging.warning("Dots.OCR not available - using mock implementation")
32
-
33
- # Import local field extraction utilities
34
- from field_extraction import FieldExtractor
35
-
36
- # Configure logging
37
- logging.basicConfig(level=logging.INFO)
38
- logger = logging.getLogger(__name__)
39
-
40
- # Global model instance
41
- dots_ocr_model = None
42
 
43
 
44
  class BoundingBox(BaseModel):
@@ -57,6 +24,31 @@ class ExtractedField(BaseModel):
57
  source: str = Field(..., description="Extraction source (e.g., 'ocr')")
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  class ExtractedFields(BaseModel):
61
  """All extracted fields from identity document."""
62
  document_number: Optional[ExtractedField] = None
@@ -78,6 +70,7 @@ class ExtractedFields(BaseModel):
78
 
79
  class MRZData(BaseModel):
80
  """Machine Readable Zone data."""
 
81
  document_type: Optional[str] = Field(None, description="MRZ document type (TD1|TD2|TD3)")
82
  issuing_country: Optional[str] = Field(None, description="Issuing country code")
83
  surname: Optional[str] = Field(None, description="Surname from MRZ")
@@ -91,6 +84,11 @@ class MRZData(BaseModel):
91
  raw_mrz: Optional[str] = Field(None, description="Raw MRZ text")
92
  confidence: float = Field(0.0, ge=0.0, le=1.0, description="MRZ extraction confidence")
93
 
 
 
 
 
 
94
 
95
  class OCRDetection(BaseModel):
96
  """Single OCR detection result."""
@@ -104,131 +102,3 @@ class OCRResponse(BaseModel):
104
  media_type: str = Field(..., description="Media type processed")
105
  processing_time: float = Field(..., description="Processing time in seconds")
106
  detections: List[OCRDetection] = Field(..., description="List of OCR detections")
107
-
108
-
109
- # FieldExtractor is now imported from the shared module
110
-
111
-
112
- def crop_image_by_roi(image: np.ndarray, roi: BoundingBox) -> np.ndarray:
113
- """Crop image using ROI coordinates."""
114
- h, w = image.shape[:2]
115
- x1 = int(roi.x1 * w)
116
- y1 = int(roi.y1 * h)
117
- x2 = int(roi.x2 * w)
118
- y2 = int(roi.y2 * h)
119
-
120
- # Ensure coordinates are within image bounds
121
- x1 = max(0, min(x1, w))
122
- y1 = max(0, min(y1, h))
123
- x2 = max(x1, min(x2, w))
124
- y2 = max(y1, min(y2, h))
125
-
126
- return image[y1:y2, x1:x2]
127
-
128
-
129
- @asynccontextmanager
130
- async def lifespan(app: FastAPI):
131
- """Application lifespan manager for model loading."""
132
- global dots_ocr_model
133
-
134
- logger.info("Loading Dots.OCR model...")
135
- try:
136
- if DOTS_OCR_AVAILABLE:
137
- # Load Dots.OCR model
138
- dots_ocr_model = DotsOCR()
139
- logger.info("Dots.OCR model loaded successfully")
140
- else:
141
- logger.warning("Dots.OCR not available - using mock implementation")
142
- dots_ocr_model = "mock"
143
- except Exception as e:
144
- logger.error(f"Failed to load Dots.OCR model: {e}")
145
- # Don't raise - allow mock mode for development
146
- dots_ocr_model = "mock"
147
-
148
- yield
149
-
150
- logger.info("Shutting down Dots.OCR endpoint...")
151
-
152
-
153
- app = FastAPI(
154
- title="KYB Dots.OCR Text Extraction",
155
- description="Dots.OCR for identity document text extraction with ROI support",
156
- version="1.0.0",
157
- lifespan=lifespan
158
- )
159
-
160
-
161
- @app.get("/health")
162
- async def health_check():
163
- """Health check endpoint."""
164
- return {"status": "healthy", "version": "1.0.0"}
165
-
166
-
167
- @app.post("/v1/id/ocr", response_model=OCRResponse)
168
- async def extract_text(
169
- file: UploadFile = File(..., description="Image file to process"),
170
- roi: Optional[str] = Form(None, description="ROI coordinates as JSON string")
171
- ):
172
- """Extract text from identity document image."""
173
- if dots_ocr_model is None:
174
- raise HTTPException(status_code=503, detail="Model not loaded")
175
-
176
- start_time = time.time()
177
- request_id = str(uuid.uuid4())
178
-
179
- try:
180
- # Read and validate image
181
- image_data = await file.read()
182
- image = Image.open(io.BytesIO(image_data))
183
- image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
184
-
185
- # Parse ROI if provided
186
- roi_bbox = None
187
- if roi:
188
- try:
189
- roi_data = json.loads(roi)
190
- roi_bbox = BoundingBox(**roi_data)
191
- # Crop image to ROI
192
- image_cv = crop_image_by_roi(image_cv, roi_bbox)
193
- except Exception as e:
194
- logger.warning(f"Invalid ROI provided: {e}")
195
-
196
- # Run OCR
197
- if DOTS_OCR_AVAILABLE and dots_ocr_model != "mock":
198
- # Use real Dots.OCR model
199
- ocr_results = dots_ocr_model(image_cv)
200
- ocr_text = " ".join([result.text for result in ocr_results])
201
- else:
202
- # Mock implementation for development
203
- ocr_text = "MOCK OCR TEXT - Document Number: NLD123456789 Surname: MULDER Given Names: THOMAS"
204
- logger.info("Using mock OCR implementation")
205
-
206
- # Extract structured fields
207
- extracted_fields = FieldExtractor.extract_fields(ocr_text)
208
-
209
- # Extract MRZ data
210
- mrz_data = FieldExtractor.extract_mrz(ocr_text)
211
-
212
- # Create detection
213
- detection = OCRDetection(
214
- mrz_data=mrz_data,
215
- extracted_fields=extracted_fields
216
- )
217
-
218
- processing_time = time.time() - start_time
219
-
220
- return OCRResponse(
221
- request_id=request_id,
222
- media_type="image",
223
- processing_time=processing_time,
224
- detections=[detection]
225
- )
226
-
227
- except Exception as e:
228
- logger.error(f"OCR extraction failed: {e}")
229
- raise HTTPException(status_code=500, detail=f"OCR extraction failed: {str(e)}")
230
-
231
-
232
- if __name__ == "__main__":
233
- import uvicorn
234
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ """API models for Dots.OCR text extraction service.
2
 
3
+ This module defines the data structures used for API requests,
4
+ responses, and internal data processing.
5
  """
6
 
 
 
 
 
 
7
  from typing import List, Optional, Dict, Any
 
 
 
 
 
 
8
  from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  class BoundingBox(BaseModel):
 
24
  source: str = Field(..., description="Extraction source (e.g., 'ocr')")
25
 
26
 
27
class IdCardFields(BaseModel):
    """Structured fields extracted from identity documents.

    Every attribute is optional: a field is ``None`` when the extractor
    could not find it in the document. Non-None values are wrapped in
    ``ExtractedField`` so each carries its own confidence and source.
    """
    # Document identification
    document_number: Optional[ExtractedField] = Field(None, description="Document number/ID")
    document_type: Optional[ExtractedField] = Field(None, description="Type of document")
    issuing_country: Optional[ExtractedField] = Field(None, description="Issuing country code")
    issuing_authority: Optional[ExtractedField] = Field(None, description="Issuing authority")

    # Personal Information
    surname: Optional[ExtractedField] = Field(None, description="Family name/surname")
    given_names: Optional[ExtractedField] = Field(None, description="Given names")
    nationality: Optional[ExtractedField] = Field(None, description="Nationality code")
    date_of_birth: Optional[ExtractedField] = Field(None, description="Date of birth")
    gender: Optional[ExtractedField] = Field(None, description="Gender")
    place_of_birth: Optional[ExtractedField] = Field(None, description="Place of birth")

    # Validity Information
    date_of_issue: Optional[ExtractedField] = Field(None, description="Date of issue")
    date_of_expiry: Optional[ExtractedField] = Field(None, description="Date of expiry")
    personal_number: Optional[ExtractedField] = Field(None, description="Personal number")

    # Additional fields for specific document types
    optional_data_1: Optional[ExtractedField] = Field(None, description="Optional data field 1")
    optional_data_2: Optional[ExtractedField] = Field(None, description="Optional data field 2")
50
+
51
+
52
  class ExtractedFields(BaseModel):
53
  """All extracted fields from identity document."""
54
  document_number: Optional[ExtractedField] = None
 
70
 
71
  class MRZData(BaseModel):
72
  """Machine Readable Zone data."""
73
+ # Primary canonical fields
74
  document_type: Optional[str] = Field(None, description="MRZ document type (TD1|TD2|TD3)")
75
  issuing_country: Optional[str] = Field(None, description="Issuing country code")
76
  surname: Optional[str] = Field(None, description="Surname from MRZ")
 
84
  raw_mrz: Optional[str] = Field(None, description="Raw MRZ text")
85
  confidence: float = Field(0.0, ge=0.0, le=1.0, description="MRZ extraction confidence")
86
 
87
+ # Backwards compatibility fields (some older code/tests expect these names)
88
+ # These duplicate information from the canonical fields above.
89
+ format_type: Optional[str] = Field(None, description="Alias of document_type for backward compatibility")
90
+ raw_text: Optional[str] = Field(None, description="Alias of raw_mrz for backward compatibility")
91
+
92
 
93
  class OCRDetection(BaseModel):
94
  """Single OCR detection result."""
 
102
  media_type: str = Field(..., description="Media type processed")
103
  processing_time: float = Field(..., description="Processing time in seconds")
104
  detections: List[OCRDetection] = Field(..., description="List of OCR detections")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/kybtech_dots_ocr/app.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HF Dots.OCR Text Extraction Endpoint
2
+
3
+ This FastAPI application provides a Hugging Face Space endpoint for Dots.OCR
4
+ text extraction with ROI support and standardized field extraction schema.
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import time
10
+ import uuid
11
+ import json
12
+ import re
13
+ from typing import List, Optional, Dict, Any
14
+ from contextlib import asynccontextmanager
15
+
16
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
17
+ from fastapi.responses import JSONResponse
18
+
19
+ # Import local modules
20
+ from .api_models import BoundingBox, ExtractedField, ExtractedFields, MRZData, OCRDetection, OCRResponse
21
+ from .enhanced_field_extraction import EnhancedFieldExtractor
22
+ from .model_loader import load_model, extract_text, is_model_loaded, get_model_info
23
+ from .preprocessing import process_document, validate_file_size, get_document_info
24
+ from .response_builder import build_ocr_response, build_error_response
25
+
26
+ # Configure logging
27
+ logging.basicConfig(level=logging.INFO)
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # Global model state
31
+ model_loaded = False
32
+
33
+
34
+ # FieldExtractor is now imported from the shared module
35
+
36
+
37
+
38
+
39
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager for model loading.

    On startup, loads the Dots.OCR model unless DOTS_OCR_SKIP_MODEL_LOAD=1 is
    set (fast mock mode for tests/CI). A load failure is deliberately not
    re-raised so the app can still serve in mock mode during development.
    """
    global model_loaded

    # Allow tests and lightweight environments to skip model loading
    # Set DOTS_OCR_SKIP_MODEL_LOAD=1 to bypass heavy downloads during tests/CI
    skip_requested = os.getenv("DOTS_OCR_SKIP_MODEL_LOAD", "0") == "1"

    logger.info("Loading Dots.OCR model...")
    try:
        if not skip_requested:
            # Load the model via the shared model loader and record its info.
            load_model()
            model_loaded = True
            logger.info("Dots.OCR model loaded successfully")

            model_info = get_model_info()
            logger.info(f"Model info: {model_info}")
        else:
            # Explicitly skip model loading for fast startup in tests/CI.
            model_loaded = False
            logger.warning("DOTS_OCR_SKIP_MODEL_LOAD=1 set - skipping model load (mock mode)")
    except Exception as e:
        # Swallow the error on purpose: mock mode keeps the API usable.
        logger.error(f"Failed to load Dots.OCR model: {e}")
        model_loaded = False
        logger.warning("Model loading failed - using mock implementation")

    yield

    logger.info("Shutting down Dots.OCR endpoint...")
73
+
74
+
75
# FastAPI application instance. The `lifespan` handler defined above loads the
# Dots.OCR model at startup and logs at shutdown.
app = FastAPI(
    title="KYB Dots.OCR Text Extraction",
    description="Dots.OCR for identity document text extraction with ROI support",
    version="1.0.0",
    lifespan=lifespan
)
81
+
82
+
83
+ @app.get("/health")
84
+ async def health_check():
85
+ """Health check endpoint."""
86
+ global model_loaded
87
+
88
+ status = "healthy" if model_loaded else "degraded"
89
+ model_info = get_model_info() if model_loaded else None
90
+
91
+ return {
92
+ "status": status,
93
+ "version": "1.0.0",
94
+ "model_loaded": model_loaded,
95
+ "model_info": model_info
96
+ }
97
+
98
+
99
+ @app.post("/v1/id/ocr", response_model=OCRResponse)
100
+ async def extract_text_endpoint(
101
+ file: UploadFile = File(..., description="Image or PDF file to process"),
102
+ roi: Optional[str] = Form(None, description="ROI coordinates as JSON string")
103
+ ):
104
+ """Extract text from identity document image or PDF."""
105
+ global model_loaded
106
+
107
+ # Allow mock mode when model isn't loaded to support tests/CI and dev flows
108
+ allow_mock = os.getenv("DOTS_OCR_ALLOW_MOCK", "1") == "1"
109
+ is_mock_mode = (not model_loaded) and allow_mock
110
+ if not model_loaded and not allow_mock:
111
+ raise HTTPException(status_code=503, detail="Model not loaded")
112
+
113
+ start_time = time.time()
114
+ request_id = str(uuid.uuid4())
115
+
116
+ try:
117
+ # Read file data
118
+ file_data = await file.read()
119
+
120
+ # Validate file size
121
+ if not validate_file_size(file_data):
122
+ raise HTTPException(status_code=413, detail="File size exceeds limit")
123
+
124
+ # Get document information
125
+ doc_info = get_document_info(file_data)
126
+ logger.info(f"Processing document: {doc_info}")
127
+
128
+ # Parse ROI if provided
129
+ roi_coords = None
130
+ if roi:
131
+ try:
132
+ roi_data = json.loads(roi)
133
+ roi_bbox = BoundingBox(**roi_data)
134
+ roi_coords = (roi_bbox.x1, roi_bbox.y1, roi_bbox.x2, roi_bbox.y2)
135
+ logger.info(f"Using ROI: {roi_coords}")
136
+ except Exception as e:
137
+ logger.warning(f"Invalid ROI provided: {e}")
138
+ raise HTTPException(status_code=400, detail=f"Invalid ROI format: {e}")
139
+
140
+ # Process document (PDF to images or single image)
141
+ try:
142
+ processed_images = process_document(file_data, roi_coords)
143
+ logger.info(f"Processed {len(processed_images)} images from document")
144
+ except Exception as e:
145
+ logger.error(f"Document processing failed: {e}")
146
+ raise HTTPException(status_code=400, detail=f"Document processing failed: {e}")
147
+
148
+ # Process each image and extract text
149
+ ocr_texts = []
150
+ page_metadata = []
151
+
152
+ for i, image in enumerate(processed_images):
153
+ try:
154
+ # Extract text using the loaded model, or produce mock output in mock mode
155
+ if is_mock_mode:
156
+ # In mock mode, we skip model inference and return empty text
157
+ ocr_text = ""
158
+ else:
159
+ ocr_text = extract_text(image)
160
+ logger.info(f"Page {i + 1} - Extracted text length: {len(ocr_text)} characters")
161
+
162
+ ocr_texts.append(ocr_text)
163
+
164
+ # Collect page metadata
165
+ page_meta = {
166
+ "page_index": i,
167
+ "image_size": image.size,
168
+ "text_length": len(ocr_text),
169
+ "processing_successful": True
170
+ }
171
+ page_metadata.append(page_meta)
172
+
173
+ except Exception as e:
174
+ logger.error(f"Text extraction failed for page {i + 1}: {e}")
175
+ # Add empty text for failed page
176
+ ocr_texts.append("")
177
+
178
+ page_meta = {
179
+ "page_index": i,
180
+ "image_size": image.size if hasattr(image, 'size') else (0, 0),
181
+ "text_length": 0,
182
+ "processing_successful": False,
183
+ "error": str(e)
184
+ }
185
+ page_metadata.append(page_meta)
186
+
187
+ # Determine media type for response
188
+ media_type = "pdf" if doc_info["is_pdf"] else "image"
189
+
190
+ processing_time = time.time() - start_time
191
+
192
+ # Build response using the response builder
193
+ return build_ocr_response(
194
+ request_id=request_id,
195
+ media_type=media_type,
196
+ processing_time=processing_time,
197
+ ocr_texts=ocr_texts,
198
+ page_metadata=page_metadata
199
+ )
200
+
201
+ except HTTPException:
202
+ # Re-raise HTTP exceptions as-is
203
+ raise
204
+ except Exception as e:
205
+ logger.error(f"OCR extraction failed: {e}")
206
+ processing_time = time.time() - start_time
207
+ error_response = build_error_response(
208
+ request_id=request_id,
209
+ error_message=f"OCR extraction failed: {str(e)}",
210
+ processing_time=processing_time
211
+ )
212
+ raise HTTPException(status_code=500, detail=error_response.dict())
213
+
214
+
215
+ if __name__ == "__main__":
216
+ import uvicorn
217
+ uvicorn.run(app, host="0.0.0.0", port=7860)
src/kybtech_dots_ocr/enhanced_field_extraction.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Enhanced field extraction utilities for Dots.OCR text processing.
2
+
3
+ This module provides improved field extraction and mapping from OCR results
4
+ to structured KYB field formats with better confidence scoring and validation.
5
+ """
6
+
7
+ import re
8
+ import logging
9
+ from typing import Optional, Dict, List, Tuple, Any
10
+ from datetime import datetime
11
+ from .api_models import ExtractedField, IdCardFields, MRZData
12
+
13
+ # Configure logging
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class EnhancedFieldExtractor:
    """Enhanced field extraction with improved confidence scoring and validation.

    All methods are classmethods operating on raw OCR text. Field patterns are
    ordered most-specific first; the first pattern whose capture passes
    validation wins, and its base confidence is then adjusted by contextual
    heuristics. Return-type annotations for project models are quoted so the
    class can be defined without importing them eagerly.
    """

    # Enhanced field mapping patterns with confidence scoring
    FIELD_PATTERNS = {
        "document_number": [
            (r"documentnummer[:\s]*([A-Z0-9]{6,15})", 0.9),  # Dutch format
            (r"document\s*number[:\s]*([A-Z0-9]{6,15})", 0.85),  # English format
            (r"nr[:\s]*([A-Z0-9]{6,15})", 0.7),  # Abbreviated format
            (r"ID[:\s]*([A-Z0-9]{6,15})", 0.8),  # ID format
            (r"([A-Z]{3}\d{9})", 0.75),  # Passport format (3 letters + 9 digits)
        ],
        "surname": [
            # Anchor to line and capture value up to newline to avoid spilling into next label
            (r"^\s*achternaam[:\s]*([^\r\n]+)", 0.95),  # Dutch format (line-anchored)
            (r"^\s*surname[:\s]*([^\r\n]+)", 0.9),  # English format (line-anchored)
            (r"^\s*family\s*name[:\s]*([^\r\n]+)", 0.85),  # Full English
            (r"^\s*last\s*name[:\s]*([^\r\n]+)", 0.85),  # Alternative English
        ],
        "given_names": [
            (r"^\s*voornamen[:\s]*([^\r\n]+)", 0.95),  # Dutch format (line-anchored)
            (r"^\s*given\s*names[:\s]*([^\r\n]+)", 0.9),  # English format (line-anchored)
            (r"^\s*first\s*name[:\s]*([^\r\n]+)", 0.85),  # First name only
            (r"^\s*voorletters[:\s]*([^\r\n]+)", 0.75),  # Dutch initials
        ],
        "nationality": [
            (r"nationaliteit[:\s]*([A-Z]{3})", 0.9),  # Dutch format (3-letter code)
            (r"nationality[:\s]*([A-Z]{3})", 0.85),  # English format
            (r"nationality[:\s]*([A-Za-z\s]{3,20})", 0.7),  # Full country name
        ],
        "date_of_birth": [
            (r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9),  # Dutch format
            (r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.85),  # English format
            (r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Short English
            (r"(\d{2}[./-]\d{2}[./-]\d{4})", 0.6),  # Generic date pattern
        ],
        "gender": [
            (r"geslacht[:\s]*([MF])", 0.9),  # Dutch format
            (r"gender[:\s]*([MF])", 0.85),  # English format
            (r"sex[:\s]*([MF])", 0.8),  # Alternative English
            (r"geslacht[:\s]*(man|vrouw)", 0.7),  # Dutch full words
            (r"gender[:\s]*(male|female)", 0.7),  # English full words
        ],
        "place_of_birth": [
            (r"geboorteplaats[:\s]*([A-Za-z\s]{2,30})", 0.9),  # Dutch format
            (r"place\s*of\s*birth[:\s]*([A-Za-z\s]{2,30})", 0.85),  # English format
            (r"born\s*in[:\s]*([A-Za-z\s]{2,30})", 0.8),  # Short English
        ],
        "date_of_issue": [
            (r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9),  # Dutch format
            (r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.85),  # English format
            (r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Short English
        ],
        "date_of_expiry": [
            (r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.9),  # Dutch format
            (r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.85),  # English format
            (r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Short English
            (r"valid\s*until[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})", 0.8),  # Alternative English
        ],
        "personal_number": [
            (r"persoonsnummer[:\s]*(\d{9})", 0.9),  # Dutch format
            (r"personal\s*number[:\s]*(\d{9})", 0.85),  # English format
            (r"bsn[:\s]*(\d{9})", 0.9),  # Dutch BSN
            (r"social\s*security[:\s]*(\d{9})", 0.8),  # SSN format
        ],
        "document_type": [
            (r"document\s*type[:\s]*([A-Za-z\s]{3,20})", 0.8),  # English format
            (r"soort\s*document[:\s]*([A-Za-z\s]{3,20})", 0.9),  # Dutch format
            (r"(passport|paspoort)", 0.9),  # Passport
            (r"(identity\s*card|identiteitskaart)", 0.9),  # ID card
            (r"(driving\s*license|rijbewijs)", 0.9),  # Driving license
        ],
        "issuing_country": [
            (r"issuing\s*country[:\s]*([A-Z]{3})", 0.85),  # English format
            (r"uitgevende\s*land[:\s]*([A-Z]{3})", 0.9),  # Dutch format
            (r"country[:\s]*([A-Z]{3})", 0.7),  # Short format
        ],
        "issuing_authority": [
            (r"issuing\s*authority[:\s]*([A-Za-z\s]{3,30})", 0.8),  # English format
            (r"uitgevende\s*autoriteit[:\s]*([A-Za-z\s]{3,30})", 0.9),  # Dutch format
            (r"authority[:\s]*([A-Za-z\s]{3,30})", 0.7),  # Short format
        ]
    }

    # MRZ patterns with confidence scoring
    MRZ_PATTERNS = [
        # Strict formats first, allowing leading/trailing whitespace per line
        (r"^\s*((?:[A-Z0-9<]{44})\s*\n\s*(?:[A-Z0-9<]{44}))\s*$", 0.95),  # TD3: Passport (2 x 44)
        (r"^\s*((?:[A-Z0-9<]{36})\s*\n\s*(?:[A-Z0-9<]{36}))\s*$", 0.9),  # TD2: ID card (2 x 36)
        (r"^\s*((?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30})\s*\n\s*(?:[A-Z0-9<]{30}))\s*$", 0.85),  # TD1: (3 x 30)
        # Fallback generic: a line starting with P< followed by another MRZ-like line
        (r"(P<[^\r\n]+\n[^\r\n]+)", 0.85),
    ]

    @classmethod
    def extract_fields(cls, ocr_text: str) -> "IdCardFields":
        """Extract structured fields from OCR text with enhanced confidence scoring.

        Args:
            ocr_text: Raw OCR text from document processing

        Returns:
            IdCardFields object with extracted field data (only fields whose
            captures passed validation are populated).
        """
        logger.info(f"Extracting fields from text of length: {len(ocr_text)}")

        fields = {}
        extraction_stats = {"total_patterns": 0, "matches_found": 0}

        for field_name, patterns in cls.FIELD_PATTERNS.items():
            value = None
            confidence = 0.0

            for pattern, base_confidence in patterns:
                extraction_stats["total_patterns"] += 1
                match = re.search(pattern, ocr_text, re.IGNORECASE | re.MULTILINE)
                if match:
                    candidate_value = match.group(1).strip()
                    # First validated capture wins; patterns are ordered
                    # most-specific (highest confidence) first.
                    if cls._validate_field_value(field_name, candidate_value):
                        value = candidate_value
                        confidence = base_confidence
                        extraction_stats["matches_found"] += 1
                        logger.debug(f"Found {field_name}: '{value}' (confidence: {confidence:.2f})")
                        break

            if value:
                # Apply additional confidence adjustments
                confidence = cls._adjust_confidence(field_name, value, confidence, ocr_text)

                fields[field_name] = ExtractedField(
                    field_name=field_name,
                    value=value,
                    confidence=confidence,
                    source="ocr"
                )

        logger.info(f"Field extraction complete: {extraction_stats['matches_found']}/{extraction_stats['total_patterns']} patterns matched")
        return IdCardFields(**fields)

    @classmethod
    def _validate_field_value(cls, field_name: str, value: str) -> bool:
        """Validate extracted field value based on field type.

        Args:
            field_name: Name of the field
            value: Extracted value to validate

        Returns:
            True if value is valid (unknown field names pass by default)
        """
        if not value or len(value.strip()) == 0:
            return False

        # Field-specific validation
        if field_name == "document_number":
            return 6 <= len(value) <= 15
        elif field_name in ["surname", "given_names", "place_of_birth"]:
            return 2 <= len(value) <= 50
        elif field_name == "nationality":
            return len(value) == 3 and value.isalpha()
        elif field_name in ["date_of_birth", "date_of_issue", "date_of_expiry"]:
            return cls._validate_date_format(value)
        elif field_name == "gender":
            return value.upper() in ["M", "F", "MALE", "FEMALE", "MAN", "VROUW"]
        elif field_name == "personal_number":
            return len(value) == 9 and value.isdigit()
        elif field_name == "issuing_country":
            return len(value) == 3 and value.isalpha()

        return True

    @classmethod
    def _validate_date_format(cls, date_str: str) -> bool:
        """Validate date format and basic date logic.

        Accepts D/M/Y order with '.', '/' or '-' separators; performs only
        range checks (no calendar validation such as 31 Feb).

        Args:
            date_str: Date string to validate

        Returns:
            True if date format is valid
        """
        try:
            # Try different date separators
            for sep in [".", "/", "-"]:
                if sep in date_str:
                    parts = date_str.split(sep)
                    if len(parts) == 3:
                        day, month, year = parts
                        # Basic range validation only
                        if (1 <= int(day) <= 31 and
                                1 <= int(month) <= 12 and
                                1900 <= int(year) <= 2100):
                            return True
        except (ValueError, IndexError):
            pass
        return False

    @classmethod
    def _adjust_confidence(cls, field_name: str, value: str, base_confidence: float, full_text: str) -> float:
        """Adjust confidence based on additional factors.

        Args:
            field_name: Name of the field
            value: Extracted value
            base_confidence: Base confidence from pattern matching
            full_text: Full OCR text for context

        Returns:
            Adjusted confidence score, clamped to [0.0, 1.0]
        """
        confidence = base_confidence

        # Length-based adjustments
        if field_name in ["surname", "given_names"] and len(value) < 3:
            confidence *= 0.8  # Shorter names are less reliable

        # Context-based adjustments
        if field_name == "document_number" and "passport" in full_text.lower():
            confidence *= 1.1  # Higher confidence in passport context

        # Multiple occurrence bonus (count > 1 implies presence, so no
        # separate membership check is needed)
        if full_text.count(value) > 1:
            confidence *= 1.05  # Slight bonus for repeated values

        # Ensure confidence stays within bounds
        return min(max(confidence, 0.0), 1.0)

    @classmethod
    def extract_mrz(cls, ocr_text: str) -> Optional["MRZData"]:
        """Extract MRZ data from OCR text with enhanced validation.

        Args:
            ocr_text: Raw OCR text from document processing

        Returns:
            MRZData object if MRZ detected, None otherwise. Both canonical
            (document_type/raw_mrz) and legacy alias (format_type/raw_text)
            fields are populated for backward compatibility.
        """
        logger.info("Extracting MRZ data from OCR text")

        best_match = None
        best_confidence = 0.0

        for pattern, base_confidence in cls.MRZ_PATTERNS:
            match = re.search(pattern, ocr_text, re.MULTILINE)
            if match:
                raw_mrz = match.group(1)
                # Validate MRZ format
                if cls._validate_mrz_format(raw_mrz):
                    confidence = cls._adjust_mrz_confidence(raw_mrz, base_confidence)

                    if confidence > best_confidence:
                        best_match = raw_mrz
                        best_confidence = confidence
                        logger.debug(f"Found MRZ with confidence {confidence:.2f}")

        if best_match:
            # Parse MRZ to determine format type
            format_type = cls._determine_mrz_format(best_match)

            # Basic checksum validation (simplified; result only logged)
            is_valid, errors = cls._validate_mrz_checksums(best_match, format_type)

            logger.info(f"MRZ extracted: {format_type} format, valid: {is_valid}")

            # MRZData is imported at module level; populate both canonical and
            # legacy alias fields for compatibility.
            return MRZData(
                document_type=format_type,
                format_type=format_type,  # legacy alias
                issuing_country=None,  # would be parsed in full impl
                surname=None,
                given_names=None,
                document_number=None,
                nationality=None,
                date_of_birth=None,
                gender=None,
                date_of_expiry=None,
                personal_number=None,
                raw_mrz=best_match,
                raw_text=best_match,  # legacy alias
                confidence=best_confidence,
            )

        logger.info("No MRZ data found in OCR text")
        return None

    @classmethod
    def _validate_mrz_format(cls, mrz_text: str) -> bool:
        """Validate basic MRZ format.

        Args:
            mrz_text: Raw MRZ text

        Returns:
            True if at least two lines are present and every line (after
            stripping whitespace) contains only A-Z, 0-9 and '<'
        """
        lines = mrz_text.strip().split('\n')
        if len(lines) < 2:
            return False

        # Normalize whitespace and validate character set only.
        normalized_lines = [re.sub(r"\s+", "", line) for line in lines]
        for line in normalized_lines:
            if not re.match(r'^[A-Z0-9<]+$', line):
                return False

        return True

    @classmethod
    def _determine_mrz_format(cls, mrz_text: str) -> str:
        """Determine MRZ format type.

        Args:
            mrz_text: Raw MRZ text

        Returns:
            Format type (TD1, TD2, TD3, or UNKNOWN)
        """
        lines = [re.sub(r"\s+", "", line) for line in mrz_text.strip().split('\n')]
        line_count = len(lines)
        line_length = len(lines[0]) if lines else 0

        # Heuristic mapping: a 3-line MRZ is TD1; a 2-line MRZ is TD3 when it
        # starts with the passport prefix P< OR has 44-char lines (the strict
        # TD3 pattern matches 44-char lines that need not start with P<),
        # and TD2 when lines are 36 chars.
        if line_count == 3:
            return "TD1"
        if line_count == 2 and (lines[0].startswith("P<") or line_length == 44):
            return "TD3"
        if line_count == 2 and line_length == 36:
            return "TD2"
        return "UNKNOWN"

    @classmethod
    def _adjust_mrz_confidence(cls, mrz_text: str, base_confidence: float) -> float:
        """Adjust MRZ confidence based on quality indicators.

        Args:
            mrz_text: Raw MRZ text
            base_confidence: Base confidence from pattern matching

        Returns:
            Adjusted confidence, clamped to [0.0, 1.0]
        """
        confidence = base_confidence

        # Check line consistency
        lines = mrz_text.strip().split('\n')
        if len(set(len(line) for line in lines)) == 1:
            confidence *= 1.05  # Bonus for consistent line lengths

        return min(max(confidence, 0.0), 1.0)

    @classmethod
    def _validate_mrz_checksums(cls, mrz_text: str, format_type: str) -> Tuple[bool, List[str]]:
        """Validate MRZ checksums (simplified implementation).

        Args:
            mrz_text: Raw MRZ text
            format_type: MRZ format type

        Returns:
            Tuple of (is_valid, list_of_errors)
        """
        # This is a simplified implementation; production code should implement
        # full ICAO 9303 check-digit validation.
        errors = []

        # Basic validation - check for reasonable character distribution
        if mrz_text.count('<') > len(mrz_text) * 0.3:
            errors.append("Too many fill characters")

        # For now, assume valid if basic format is correct
        is_valid = len(errors) == 0

        return is_valid, errors
+ return is_valid, errors
398
+
399
+
400
+ # Backward compatibility - use enhanced extractor as default
401
class FieldExtractor(EnhancedFieldExtractor):
    """Backward compatible field extractor using enhanced implementation."""
    # Thin alias kept so existing imports of FieldExtractor continue to work.
    pass
field_extraction.py → src/kybtech_dots_ocr/field_extraction.py RENAMED
@@ -6,7 +6,7 @@ to structured KYB field formats.
6
 
7
  import re
8
  from typing import Optional
9
- from models import ExtractedField, IdCardFields, MRZData
10
 
11
 
12
  class FieldExtractor:
 
6
 
7
  import re
8
  from typing import Optional
9
+ from .api_models import ExtractedField, IdCardFields, MRZData
10
 
11
 
12
  class FieldExtractor:
src/kybtech_dots_ocr/model_loader.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dots.OCR Model Loader
2
+
3
+ This module handles downloading and loading the Dots.OCR model using Hugging Face's
4
+ snapshot_download functionality. It provides device selection, dtype configuration,
5
+ and model initialization with proper error handling.
6
+ """
7
+
8
+ import os
9
+ import logging
10
+ import torch
11
+ from typing import Optional, Tuple, Dict, Any
12
+ from pathlib import Path
13
+
14
+ from huggingface_hub import snapshot_download
15
+ from transformers import AutoModelForCausalLM, AutoProcessor
16
+ from PIL import Image
17
+
18
+ # Configure logging
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Environment variable configuration
22
+ REPO_ID = os.getenv("DOTS_OCR_REPO_ID", "rednote-hilab/dots.ocr")
23
+ LOCAL_DIR = os.getenv("DOTS_OCR_LOCAL_DIR", "/data/models/dots-ocr")
24
+ DEVICE_CONFIG = os.getenv("DOTS_OCR_DEVICE", "auto")
25
+ MAX_NEW_TOKENS = int(os.getenv("DOTS_OCR_MAX_NEW_TOKENS", "2048"))
26
+ USE_FLASH_ATTENTION = os.getenv("DOTS_OCR_FLASH_ATTENTION", "1") == "1"
27
+ MIN_PIXELS = int(os.getenv("DOTS_OCR_MIN_PIXELS", "3136")) # 56x56
28
+ MAX_PIXELS = int(os.getenv("DOTS_OCR_MAX_PIXELS", "11289600")) # 3360x3360
29
+ CUSTOM_PROMPT = os.getenv("DOTS_OCR_PROMPT")
30
+
31
+ # Default transcription prompt for faithful text extraction
32
+ DEFAULT_PROMPT = (
33
+ "Transcribe all visible text in the image in the original language. "
34
+ "Do not translate. Preserve natural reading order. Output plain text only."
35
+ )
36
+
37
+
38
class DotsOCRModelLoader:
    """Handles Dots.OCR model downloading, loading, and inference.

    Lifecycle: construct, call load_model() once, then extract_text() per
    image. Configuration comes from the module-level environment-derived
    constants (REPO_ID, LOCAL_DIR, DEVICE_CONFIG, MAX_NEW_TOKENS,
    USE_FLASH_ATTENTION, MIN_PIXELS, MAX_PIXELS, CUSTOM_PROMPT).
    """

    def __init__(self):
        """Initialize the model loader; nothing is downloaded or loaded yet."""
        self.model = None       # transformers model, set by load_model()
        self.processor = None   # AutoProcessor, set by load_model()
        self.device = None      # "cuda" or "cpu", chosen in load_model()
        self.dtype = None       # torch dtype matching the device
        self.local_dir = None   # local snapshot path after download
        self.prompt = CUSTOM_PROMPT or DEFAULT_PROMPT

    def _determine_device_and_dtype(self) -> Tuple[str, torch.dtype]:
        """Determine the best device and dtype based on availability and configuration.

        Returns:
            Tuple of (device, dtype): bfloat16 on CUDA, float32 on CPU.
        """
        if DEVICE_CONFIG == "cpu":
            device = "cpu"
            dtype = torch.float32
        elif DEVICE_CONFIG == "cuda" and torch.cuda.is_available():
            device = "cuda"
            dtype = torch.bfloat16
        elif DEVICE_CONFIG == "auto":
            if torch.cuda.is_available():
                device = "cuda"
                dtype = torch.bfloat16
            else:
                device = "cpu"
                dtype = torch.float32
        else:
            # Reached when CUDA was requested but is unavailable, or when
            # DEVICE_CONFIG holds an unrecognized value.
            logger.warning(f"Device '{DEVICE_CONFIG}' requested but not available, falling back to CPU")
            device = "cpu"
            dtype = torch.float32

        logger.info(f"Selected device: {device}, dtype: {dtype}")
        return device, dtype

    def _download_model(self) -> str:
        """Download the model using snapshot_download.

        Returns:
            Local filesystem path of the downloaded snapshot.

        Raises:
            RuntimeError: if the download fails (original error chained).
        """
        logger.info(f"Downloading model from {REPO_ID} to {LOCAL_DIR}")

        try:
            # Ensure local directory exists
            Path(LOCAL_DIR).mkdir(parents=True, exist_ok=True)

            # Download model snapshot.
            # NOTE(review): local_dir_use_symlinks is deprecated in recent
            # huggingface_hub releases (real files are now the default for
            # local_dir) — confirm the pinned version before removing.
            local_path = snapshot_download(
                repo_id=REPO_ID,
                local_dir=LOCAL_DIR,
                local_dir_use_symlinks=False,  # Avoid symlink issues in containers
            )

            logger.info(f"Model downloaded successfully to {local_path}")
            return local_path

        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise RuntimeError(f"Model download failed: {e}") from e

    def load_model(self) -> None:
        """Load the Dots.OCR model and processor.

        Downloads the snapshot if needed, then loads processor and model with
        device-appropriate settings (flash attention + device_map on CUDA).

        Raises:
            RuntimeError: if any step fails (original error chained).
        """
        try:
            # Determine device and dtype
            self.device, self.dtype = self._determine_device_and_dtype()

            # Download model if not already present
            self.local_dir = self._download_model()

            # Load processor
            logger.info("Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                self.local_dir,
                trust_remote_code=True
            )

            # Load model with appropriate configuration
            model_kwargs = {
                "torch_dtype": self.dtype,
                "trust_remote_code": True,
            }

            # Add device-specific configurations
            if self.device == "cuda":
                if USE_FLASH_ATTENTION:
                    # Requesting flash_attention_2 here does not verify that
                    # the flash-attn package is installed; if it is missing,
                    # from_pretrained raises and is reported below.
                    model_kwargs["attn_implementation"] = "flash_attention_2"
                    logger.info("Using flash attention 2")

                # Use device_map for automatic GPU memory management
                model_kwargs["device_map"] = "auto"
            else:
                # For CPU, don't use device_map
                model_kwargs["device_map"] = None

            logger.info("Loading model...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.local_dir,
                **model_kwargs
            )

            # Move model to device explicitly when device_map was not used
            if model_kwargs.get("device_map") is None:
                self.model = self.model.to(self.device)

            logger.info(f"Model loaded successfully on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise RuntimeError(f"Model loading failed: {e}") from e

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Preprocess image to meet model requirements.

        Converts to RGB, scales into the [MIN_PIXELS, MAX_PIXELS] area range,
        and rounds both dimensions up to a multiple of 28.
        """
        # Convert to RGB if necessary
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Calculate current pixel count
        width, height = image.size
        current_pixels = width * height

        # Resize if necessary to meet pixel requirements
        if current_pixels < MIN_PIXELS:
            # Scale up to meet minimum pixel requirement
            scale_factor = (MIN_PIXELS / current_pixels) ** 0.5
            new_width = int(width * scale_factor)
            new_height = int(height * scale_factor)
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            logger.info(f"Scaled up image from {width}x{height} to {new_width}x{new_height}")

        elif current_pixels > MAX_PIXELS:
            # Scale down to meet maximum pixel requirement
            scale_factor = (MAX_PIXELS / current_pixels) ** 0.5
            new_width = int(width * scale_factor)
            new_height = int(height * scale_factor)
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            logger.info(f"Scaled down image from {width}x{height} to {new_width}x{new_height}")

        # Ensure dimensions are divisible by 28 (common requirement for vision models)
        width, height = image.size
        new_width = ((width + 27) // 28) * 28
        new_height = ((height + 27) // 28) * 28

        if new_width != width or new_height != height:
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            logger.info(f"Adjusted image dimensions to be divisible by 28: {new_width}x{new_height}")

        return image

    @torch.inference_mode()
    def extract_text(self, image: Image.Image, prompt: Optional[str] = None) -> str:
        """Extract text from an image using the loaded model.

        Args:
            image: PIL image to transcribe.
            prompt: Optional override for the configured transcription prompt.

        Returns:
            Decoded model output (empty string if decoding yields nothing).

        Raises:
            RuntimeError: if the model is not loaded or generation fails.
        """
        if self.model is None or self.processor is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        try:
            # Preprocess image
            processed_image = self._preprocess_image(image)

            # Use provided prompt or default
            text_prompt = prompt or self.prompt

            # Prepare messages for the model
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": processed_image},
                    {"type": "text", "text": text_prompt},
                ],
            }]

            # Apply chat template
            text = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            # Process vision information (required for some models)
            try:
                from qwen_vl_utils import process_vision_info
                image_inputs, video_inputs = process_vision_info(messages)
            except ImportError:
                # Fallback if qwen_vl_utils not available
                logger.warning("qwen_vl_utils not available, using basic processing")
                image_inputs = [processed_image]
                video_inputs = []

            # Prepare inputs
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            ).to(self.device)

            # Generate text with greedy decoding. temperature is intentionally
            # not passed: with do_sample=False it would be ignored and recent
            # transformers versions warn about the conflicting settings.
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                pad_token_id=self.processor.tokenizer.eos_token_id
            )

            # Strip the prompt tokens from each sequence before decoding
            trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
            decoded = self.processor.batch_decode(
                trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )

            return decoded[0] if decoded else ""

        except Exception as e:
            logger.error(f"Text extraction failed: {e}")
            raise RuntimeError(f"Text extraction failed: {e}") from e

    def is_loaded(self) -> bool:
        """Check if the model is loaded and ready for inference."""
        return self.model is not None and self.processor is not None

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model and its configuration."""
        return {
            "device": self.device,
            "dtype": str(self.dtype),
            "local_dir": self.local_dir,
            "repo_id": REPO_ID,
            "max_new_tokens": MAX_NEW_TOKENS,
            "use_flash_attention": USE_FLASH_ATTENTION,
            "prompt": self.prompt,
            "is_loaded": self.is_loaded()
        }
275
+ }
276
+
277
+
278
+ # Global model instance
279
+ _model_loader: Optional[DotsOCRModelLoader] = None
280
+
281
+
282
def get_model_loader() -> DotsOCRModelLoader:
    """Return the process-wide model loader, creating it lazily on first use."""
    global _model_loader
    if _model_loader is not None:
        return _model_loader
    _model_loader = DotsOCRModelLoader()
    return _model_loader
288
+
289
+
290
def load_model() -> None:
    """Download (if needed) and initialise the global Dots.OCR model."""
    get_model_loader().load_model()
294
+
295
+
296
def extract_text(image: Image.Image, prompt: Optional[str] = None) -> str:
    """Extract text from an image using the loaded global model.

    Raises:
        RuntimeError: if load_model() has not been called successfully.
    """
    loader = get_model_loader()
    if loader.is_loaded():
        return loader.extract_text(image, prompt)
    raise RuntimeError("Model not loaded. Call load_model() first.")
302
+
303
+
304
def is_model_loaded() -> bool:
    """Check if the global model is loaded and ready for inference."""
    return get_model_loader().is_loaded()
308
+
309
+
310
def get_model_info() -> Dict[str, Any]:
    """Get information about the global model loader's configuration/state."""
    return get_model_loader().get_model_info()
models.py → src/kybtech_dots_ocr/models.py RENAMED
File without changes
src/kybtech_dots_ocr/preprocessing.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image and PDF preprocessing utilities for Dots.OCR.
2
+
3
+ This module handles PDF to image conversion, image preprocessing,
4
+ and multi-page document processing for the Dots.OCR model.
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ from typing import List, Tuple, Optional, Union
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import fitz # PyMuPDF
14
+ import numpy as np
15
+ from PIL import Image, ImageOps
16
+ import cv2
17
+
18
+ # Configure logging
19
+ logger = logging.getLogger(__name__)
20
+
21
# Environment variable configuration
PDF_DPI = int(os.getenv("DOTS_OCR_PDF_DPI", "300"))  # rasterization DPI for PDF pages
PDF_MAX_PAGES = int(os.getenv("DOTS_OCR_PDF_MAX_PAGES", "10"))  # hard cap on pages processed per PDF
# NOTE: the env var is interpreted in megabytes; the stored value is bytes.
IMAGE_MAX_SIZE = int(os.getenv("DOTS_OCR_IMAGE_MAX_SIZE", "10")) * 1024 * 1024  # 10MB default
25
+
26
+
27
class ImagePreprocessor:
    """Prepares PIL images so they satisfy the Dots.OCR model's input constraints."""

    def __init__(self, min_pixels: int = 3136, max_pixels: int = 11289600, divisor: int = 28):
        """Initialize the image preprocessor.

        Args:
            min_pixels: Smallest acceptable total pixel count.
            max_pixels: Largest acceptable total pixel count.
            divisor: Output dimensions are rounded up to a multiple of this.
        """
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.divisor = divisor

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """Normalize an image for the model.

        Ensures RGB mode, applies EXIF orientation, rescales the image so its
        total pixel count falls within [min_pixels, max_pixels], and rounds
        both dimensions up to a multiple of ``divisor``.

        Args:
            image: Input PIL Image.

        Returns:
            Preprocessed PIL Image.
        """
        if image.mode != "RGB":
            image = image.convert("RGB")
        image = ImageOps.exif_transpose(image)

        width, height = image.size
        pixel_count = width * height
        logger.info(f"Original image size: {width}x{height} ({pixel_count} pixels)")

        # Uniformly rescale so the total pixel count lands inside the bounds.
        if pixel_count < self.min_pixels:
            factor = (self.min_pixels / pixel_count) ** 0.5
            width, height = int(width * factor), int(height * factor)
            image = image.resize((width, height), Image.Resampling.LANCZOS)
            logger.info(f"Scaled up image to {width}x{height}")
        elif pixel_count > self.max_pixels:
            factor = (self.max_pixels / pixel_count) ** 0.5
            width, height = int(width * factor), int(height * factor)
            image = image.resize((width, height), Image.Resampling.LANCZOS)
            logger.info(f"Scaled down image to {width}x{height}")

        # Round each dimension up to the nearest multiple of the divisor
        # (ceil-division expressed as -(-n // d) * d).
        width, height = image.size
        aligned_w = -(-width // self.divisor) * self.divisor
        aligned_h = -(-height // self.divisor) * self.divisor
        if (aligned_w, aligned_h) != (width, height):
            image = image.resize((aligned_w, aligned_h), Image.Resampling.LANCZOS)
            logger.info(f"Adjusted dimensions to be divisible by {self.divisor}: {aligned_w}x{aligned_h}")

        return image

    def crop_by_roi(self, image: Image.Image, roi: Tuple[float, float, float, float]) -> Image.Image:
        """Crop an image to a region of interest.

        Args:
            image: Input PIL Image.
            roi: (x1, y1, x2, y2) corners, normalized to [0, 1].

        Returns:
            Cropped PIL Image.
        """
        img_w, img_h = image.size
        x1, y1, x2, y2 = roi

        # Scale normalized corners to pixels, clamp into the image, and force
        # a non-inverted box (right >= left, bottom >= top).
        left = max(0, min(int(x1 * img_w), img_w))
        top = max(0, min(int(y1 * img_h), img_h))
        right = max(left, min(int(x2 * img_w), img_w))
        bottom = max(top, min(int(y2 * img_h), img_h))

        cropped = image.crop((left, top, right, bottom))
        logger.info(f"Cropped image to {right - left}x{bottom - top} pixels")
        return cropped
122
+
123
+
124
class PDFProcessor:
    """Handles PDF to image conversion and multi-page processing."""

    def __init__(self, dpi: int = PDF_DPI, max_pages: int = PDF_MAX_PAGES):
        """Initialize the PDF processor.

        Args:
            dpi: DPI for PDF to image conversion.
            max_pages: Maximum number of pages to process.
        """
        self.dpi = dpi
        self.max_pages = max_pages

    def pdf_to_images(self, pdf_data: bytes) -> List[Image.Image]:
        """Convert PDF bytes to a list of images, one per page.

        Args:
            pdf_data: PDF file data as bytes.

        Returns:
            List of PIL Images (at most ``max_pages`` of them).

        Raises:
            RuntimeError: If the PDF cannot be opened or rendered.
        """
        try:
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        except Exception as e:
            logger.error(f"Failed to convert PDF to images: {e}")
            raise RuntimeError(f"PDF conversion failed: {e}") from e

        try:
            images = []
            # Limit number of pages to process
            num_pages = min(len(pdf_document), self.max_pages)
            logger.info(f"Processing {num_pages} pages from PDF")

            for page_num in range(num_pages):
                page = pdf_document[page_num]
                # Scale from the PDF's native 72 DPI to the requested DPI.
                mat = fitz.Matrix(self.dpi / 72, self.dpi / 72)
                pix = page.get_pixmap(matrix=mat)
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                images.append(image)
                logger.info(f"Converted page {page_num + 1} to image: {image.size}")

            return images
        except Exception as e:
            logger.error(f"Failed to convert PDF to images: {e}")
            # BUG FIX: chain the original exception instead of discarding it.
            raise RuntimeError(f"PDF conversion failed: {e}") from e
        finally:
            # BUG FIX: the document was previously leaked when rendering
            # raised, because close() only ran on the success path.
            pdf_document.close()

    def is_pdf(self, file_data: bytes) -> bool:
        """Return True when *file_data* starts with the PDF magic header."""
        return file_data.startswith(b'%PDF-')

    def get_pdf_page_count(self, pdf_data: bytes) -> int:
        """Return the number of pages in a PDF, or 0 if it cannot be opened.

        Args:
            pdf_data: PDF file data as bytes.

        Returns:
            Number of pages in the PDF (0 on failure; errors are logged).
        """
        try:
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
            try:
                return len(pdf_document)
            finally:
                # Always release the document, even if len() raises.
                pdf_document.close()
        except Exception as e:
            logger.error(f"Failed to get PDF page count: {e}")
            return 0
204
+
205
+
206
class DocumentProcessor:
    """Main document processing class that handles both images and PDFs."""

    def __init__(self):
        """Initialize the document processor."""
        self.image_preprocessor = ImagePreprocessor()
        self.pdf_processor = PDFProcessor()

    def process_document(
        self,
        file_data: bytes,
        roi: Optional[Tuple[float, float, float, float]] = None
    ) -> List[Image.Image]:
        """Process a document (image or PDF) and return preprocessed images.

        Args:
            file_data: Document file data as bytes.
            roi: Optional ROI coordinates as (x1, y1, x2, y2) normalized to [0, 1].

        Returns:
            List of preprocessed PIL Images.

        Raises:
            RuntimeError: If the input cannot be decoded, or no page could
                be preprocessed successfully.
        """
        # PDF inputs are detected by magic header; everything else is
        # treated as a single image.
        if self.pdf_processor.is_pdf(file_data):
            logger.info("Processing PDF document")
            images = self.pdf_processor.pdf_to_images(file_data)
        else:
            logger.info("Processing image document")
            try:
                image = Image.open(io.BytesIO(file_data))
                images = [image]
            except Exception as e:
                logger.error(f"Failed to open image: {e}")
                # BUG FIX: chain the original exception for debuggability.
                raise RuntimeError(f"Image processing failed: {e}") from e

        # Preprocess each page; a single bad page is skipped so the rest of
        # the document can still be returned.
        processed_images = []
        for i, image in enumerate(images):
            try:
                if roi is not None:
                    image = self.image_preprocessor.crop_by_roi(image, roi)

                processed_image = self.image_preprocessor.preprocess_image(image)
                processed_images.append(processed_image)

                logger.info(f"Processed image {i + 1}: {processed_image.size}")

            except Exception as e:
                logger.error(f"Failed to preprocess image {i + 1}: {e}")
                continue

        if not processed_images:
            raise RuntimeError("No images could be processed from the document")

        logger.info(f"Successfully processed {len(processed_images)} images")
        return processed_images

    def validate_file_size(self, file_data: bytes) -> bool:
        """Validate that file size is within limits.

        Args:
            file_data: File data as bytes.

        Returns:
            True if file size is acceptable (<= IMAGE_MAX_SIZE).
        """
        file_size = len(file_data)
        if file_size > IMAGE_MAX_SIZE:
            logger.warning(f"File size {file_size} exceeds limit {IMAGE_MAX_SIZE}")
            return False
        return True

    def get_document_info(self, file_data: bytes) -> dict:
        """Get information about the document.

        Args:
            file_data: Document file data as bytes.

        Returns:
            Dict with ``file_size`` (bytes), ``is_pdf`` (bool) and
            ``page_count`` (1 for images; actual page count for PDFs,
            0 if the PDF cannot be opened).
        """
        info = {
            "file_size": len(file_data),
            "is_pdf": self.pdf_processor.is_pdf(file_data),
            "page_count": 1
        }

        if info["is_pdf"]:
            info["page_count"] = self.pdf_processor.get_pdf_page_count(file_data)

        return info
301
+
302
+
303
# Process-wide singleton processor shared by the module-level helpers.
_document_processor: Optional[DocumentProcessor] = None


def get_document_processor() -> DocumentProcessor:
    """Return the global DocumentProcessor, creating it lazily on first use."""
    global _document_processor
    processor = _document_processor
    if processor is None:
        processor = DocumentProcessor()
        _document_processor = processor
    return processor


def process_document(
    file_data: bytes,
    roi: Optional[Tuple[float, float, float, float]] = None
) -> List[Image.Image]:
    """Process a document and return preprocessed images."""
    return get_document_processor().process_document(file_data, roi)


def validate_file_size(file_data: bytes) -> bool:
    """Validate that file size is within limits."""
    return get_document_processor().validate_file_size(file_data)


def get_document_info(file_data: bytes) -> dict:
    """Get information about the document."""
    return get_document_processor().get_document_info(file_data)
src/kybtech_dots_ocr/response_builder.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Response builder for Dots.OCR API responses.
2
+
3
+ This module handles the construction and validation of OCR API responses
4
+ according to the specified schema with proper error handling and metadata.
5
+ """
6
+
7
+ import logging
8
+ import time
9
+ from typing import List, Optional, Dict, Any
10
+ from datetime import datetime
11
+
12
+ from .api_models import OCRResponse, OCRDetection, ExtractedFields, MRZData, ExtractedField
13
+ from .enhanced_field_extraction import EnhancedFieldExtractor
14
+
15
+ # Configure logging
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class OCRResponseBuilder:
    """Builds OCR API responses with proper validation and metadata.

    Turns raw per-page OCR text into a validated ``OCRResponse``: fields and
    MRZ data are extracted per page via ``EnhancedFieldExtractor``, and the
    assembled response is sanity-checked before being returned.
    """

    def __init__(self):
        """Initialize the response builder with a field extractor."""
        self.field_extractor = EnhancedFieldExtractor()

    def build_response(
        self,
        request_id: str,
        media_type: str,
        processing_time: float,
        ocr_texts: List[str],
        page_metadata: Optional[List[Dict[str, Any]]] = None
    ) -> OCRResponse:
        """Build a complete OCR response from extracted texts.

        Args:
            request_id: Unique request identifier
            media_type: Type of media processed ("image" or "pdf")
            processing_time: Total processing time in seconds
            ocr_texts: List of OCR text results (one per page)
            page_metadata: Optional metadata for each page

        Returns:
            Complete OCRResponse object

        Raises:
            ValueError: If the assembled response fails validation.
        """
        logger.info(f"Building response for {len(ocr_texts)} pages")

        detections = []

        for i, ocr_text in enumerate(ocr_texts):
            try:
                # Extract fields and MRZ data
                extracted_fields = self.field_extractor.extract_fields(ocr_text)
                mrz_data = self.field_extractor.extract_mrz(ocr_text)

                # Create detection for this page
                detection = self._create_detection(extracted_fields, mrz_data, i, page_metadata)
                detections.append(detection)

                # NOTE(review): __dict__ counts every attribute, including
                # ones that are None — this logs the attribute count, not
                # the number of successfully extracted fields.
                logger.info(f"Page {i + 1}: {len(extracted_fields.__dict__)} fields, MRZ: {mrz_data is not None}")

            except Exception as e:
                logger.error(f"Failed to process page {i + 1}: {e}")
                # A failed page still yields a (empty) detection so page
                # indexes stay aligned with the input pages.
                detection = self._create_empty_detection(i)
                detections.append(detection)

        # Build final response
        response = OCRResponse(
            request_id=request_id,
            media_type=media_type,
            processing_time=processing_time,
            detections=detections
        )

        # Validate response
        self._validate_response(response)

        logger.info(f"Response built successfully: {len(detections)} detections")
        return response

    def _create_detection(
        self,
        extracted_fields: ExtractedFields,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None
    ) -> OCRDetection:
        """Create an OCR detection from extracted data.

        Args:
            extracted_fields: Extracted field data
            mrz_data: MRZ data if available
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            OCRDetection object
        """
        # Convert IdCardFields to ExtractedFields format expected by OCRDetection
        converted_fields = self._convert_fields_format(extracted_fields)

        # Enhance MRZ data if available
        enhanced_mrz = self._enhance_mrz_data(mrz_data, page_index, page_metadata)

        return OCRDetection(
            mrz_data=enhanced_mrz,
            extracted_fields=converted_fields
        )

    def _convert_fields_format(self, id_card_fields) -> ExtractedFields:
        """Convert IdCardFields to the format expected by OCRDetection.

        Args:
            id_card_fields: IdCardFields object (presumably from the
                extractor; attribute names are assumed to match
                ExtractedFields' field names — TODO confirm).

        Returns:
            ExtractedFields object
        """
        # Convert IdCardFields to ExtractedFields by mapping the fields
        field_dict = {}

        for field_name, field_value in id_card_fields.__dict__.items():
            if field_value is not None:
                # Convert ExtractedField to dict for Pydantic validation;
                # plain values pass through unchanged.
                field_dict[field_name] = field_value.dict() if hasattr(field_value, 'dict') else field_value

        return ExtractedFields(**field_dict)

    def _enhance_mrz_data(
        self,
        mrz_data: Optional[MRZData],
        page_index: int,
        page_metadata: Optional[List[Dict[str, Any]]] = None
    ) -> Optional[MRZData]:
        """Enhance MRZ data with additional context if available.

        Currently a pass-through: page metadata is looked up but not yet
        applied to the MRZ data.

        Args:
            mrz_data: Original MRZ data
            page_index: Index of the page
            page_metadata: Optional metadata for the page

        Returns:
            Enhanced MRZ data or None
        """
        if mrz_data is None:
            return None

        # Add page context if available
        if page_metadata and page_index < len(page_metadata):
            metadata = page_metadata[page_index]
            # Could add page-specific confidence adjustments here
            pass

        return mrz_data

    def _create_empty_detection(self, page_index: int) -> OCRDetection:
        """Create an empty detection for failed pages.

        Args:
            page_index: Index of the failed page

        Returns:
            Empty OCRDetection object (no MRZ, no extracted fields)
        """
        logger.warning(f"Creating empty detection for failed page {page_index + 1}")

        return OCRDetection(
            mrz_data=None,
            extracted_fields=ExtractedFields()
        )

    def _validate_response(self, response: OCRResponse) -> None:
        """Validate the response structure and data.

        Args:
            response: OCRResponse to validate

        Raises:
            ValueError: If response validation fails
        """
        # Validate request_id
        if not response.request_id or len(response.request_id) == 0:
            raise ValueError("Request ID cannot be empty")

        # Validate media_type
        if response.media_type not in ["image", "pdf"]:
            raise ValueError(f"Invalid media_type: {response.media_type}")

        # Validate processing_time
        if response.processing_time < 0:
            raise ValueError("Processing time cannot be negative")

        # Validate detections — empty is allowed but logged.
        if not response.detections:
            logger.warning("Response has no detections")

        # Validate each detection
        for i, detection in enumerate(response.detections):
            self._validate_detection(detection, i)

        logger.debug("Response validation passed")

    def _validate_detection(self, detection: OCRDetection, index: int) -> None:
        """Validate a single detection.

        Args:
            detection: OCRDetection to validate
            index: Index of the detection

        Raises:
            ValueError: If detection validation fails
        """
        # Validate MRZ data if present
        if detection.mrz_data:
            self._validate_mrz_data(detection.mrz_data, index)

        # Validate extracted fields
        if detection.extracted_fields:
            self._validate_extracted_fields(detection.extracted_fields, index)

    def _validate_mrz_data(self, mrz_data: MRZData, index: int) -> None:
        """Validate MRZ data.

        Args:
            mrz_data: MRZ data to validate
            index: Index of the detection

        Raises:
            ValueError: If MRZ data validation fails
        """
        # Support both canonical and legacy attribute names
        raw_text_value = getattr(mrz_data, "raw_text", None) or getattr(mrz_data, "raw_mrz", None)
        if not raw_text_value:
            raise ValueError(f"MRZ raw text cannot be empty for detection {index}")

        format_type_value = getattr(mrz_data, "format_type", None) or getattr(mrz_data, "document_type", None)
        if not format_type_value:
            raise ValueError(f"MRZ format type cannot be empty for detection {index}")

        if not (0.0 <= mrz_data.confidence <= 1.0):
            raise ValueError(f"MRZ confidence must be between 0.0 and 1.0 for detection {index}")

    def _validate_extracted_fields(self, fields: ExtractedFields, index: int) -> None:
        """Validate extracted fields.

        Args:
            fields: Extracted fields to validate
            index: Index of the detection

        Raises:
            ValueError: If fields validation fails
        """
        # Validate each field if present
        for field_name, field_value in fields.__dict__.items():
            if field_value is not None:
                if not isinstance(field_value, ExtractedField):
                    raise ValueError(f"Field {field_name} must be ExtractedField instance for detection {index}")

                # Validate field content
                if not (0.0 <= field_value.confidence <= 1.0):
                    raise ValueError(f"Field {field_name} confidence must be between 0.0 and 1.0 for detection {index}")

    def build_error_response(
        self,
        request_id: str,
        error_message: str,
        processing_time: float = 0.0
    ) -> OCRResponse:
        """Build an error response.

        NOTE(review): error_message is only logged — the response schema
        apparently has no error field, so the message is not returned to
        the caller. Confirm this is intended.

        Args:
            request_id: Unique request identifier
            error_message: Error message
            processing_time: Processing time before error

        Returns:
            Error OCRResponse object
        """
        logger.error(f"Building error response: {error_message}")

        return OCRResponse(
            request_id=request_id,
            media_type="image",  # Default media type
            processing_time=processing_time,
            detections=[]  # Empty detections for error
        )
288
+
289
+
290
# Process-wide singleton builder shared by the module-level helpers.
_response_builder: Optional[OCRResponseBuilder] = None


def get_response_builder() -> OCRResponseBuilder:
    """Return the global response builder, creating it lazily on first use."""
    global _response_builder
    builder = _response_builder
    if builder is None:
        builder = OCRResponseBuilder()
        _response_builder = builder
    return builder


def build_ocr_response(
    request_id: str,
    media_type: str,
    processing_time: float,
    ocr_texts: List[str],
    page_metadata: Optional[List[Dict[str, Any]]] = None
) -> OCRResponse:
    """Build a complete OCR response from extracted texts."""
    return get_response_builder().build_response(
        request_id, media_type, processing_time, ocr_texts, page_metadata
    )


def build_error_response(
    request_id: str,
    error_message: str,
    processing_time: float = 0.0
) -> OCRResponse:
    """Build an error response."""
    return get_response_builder().build_error_response(
        request_id, error_message, processing_time
    )
tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Test package for KYB Tech Dots.OCR."""
tests/test_app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the main FastAPI application."""
2
+
3
+ import pytest
4
+ from fastapi.testclient import TestClient
5
+ from src.kybtech_dots_ocr.app import app
6
+
7
+ client = TestClient(app)
8
+
9
+
10
def test_health_check():
    """The health endpoint responds 200 with status and version fields."""
    response = client.get("/health")
    assert response.status_code == 200
    payload = response.json()
    for key in ("status", "version"):
        assert key in payload
17
+
18
+
19
def test_ocr_endpoint_missing_file():
    """Posting without a file upload is rejected by request validation."""
    response = client.post("/v1/id/ocr")
    # 422: FastAPI rejects the request before the handler runs.
    assert response.status_code == 422
23
+
24
+
25
def test_ocr_endpoint_invalid_file():
    """A non-image upload yields a client or server error, not a crash."""
    upload = {"file": ("test.txt", b"not an image", "text/plain")}
    response = client.post("/v1/id/ocr", files=upload)
    # Any of these is an acceptable "rejected input" outcome.
    assert response.status_code in (400, 422, 500)
31
+
32
+
33
@pytest.mark.skip(reason="Requires model to be loaded")
def test_ocr_endpoint_with_image():
    """Test OCR endpoint with actual image (requires model).

    Placeholder: needs the model loaded plus a real image fixture before
    it can assert anything meaningful.
    """
    pass
tests/test_field_extraction.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for field extraction functionality."""
2
+
3
+ import pytest
4
+ from src.kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor
5
+
6
+
7
class TestEnhancedFieldExtractor:
    """Test cases for EnhancedFieldExtractor."""

    def test_extract_fields_dutch_id(self):
        """Dutch-language ID card labels are recognized and values captured."""
        extractor = EnhancedFieldExtractor()
        text = """
        IDENTITEITSKAART
        Documentnummer: NLD123456789
        Achternaam: MULDER
        Voornamen: THOMAS JAN
        Nationaliteit: NLD
        Geboortedatum: 15-03-1990
        Geslacht: M
        """

        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"
        assert fields.given_names is not None
        assert fields.given_names.value == "THOMAS JAN"

    def test_extract_fields_english_id(self):
        """English-language ID card labels are recognized as well."""
        extractor = EnhancedFieldExtractor()
        text = """
        IDENTITY CARD
        Document Number: NLD123456789
        Surname: MULDER
        Given Names: THOMAS JAN
        Nationality: NLD
        Date of Birth: 15-03-1990
        Gender: M
        """

        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"

    def test_extract_mrz_data(self):
        """A two-line TD3 (passport-style) MRZ is detected with high confidence."""
        extractor = EnhancedFieldExtractor()
        text = """
        P<NLDMULDER<<THOMAS<<<<<<<<<<<<<<<<<<<<<<<<<
        NLD123456789NLD9003151M300101123456789<<<<<<<<
        """

        mrz_data = extractor.extract_mrz(text)

        assert mrz_data is not None
        assert mrz_data.format_type == "TD3"
        assert mrz_data.confidence > 0.8

    def test_extract_fields_empty_text(self):
        """Empty input yields an empty field set rather than an error."""
        extractor = EnhancedFieldExtractor()
        fields = extractor.extract_fields("")

        # Should return empty fields
        assert fields.document_number is None
        assert fields.surname is None

    def test_confidence_scoring(self):
        """Cleaner label/value text should never score lower than sloppy text."""
        extractor = EnhancedFieldExtractor()

        # High quality text
        high_quality = "Documentnummer: NLD123456789 Achternaam: MULDER"
        fields_high = extractor.extract_fields(high_quality)

        # Lower quality text
        low_quality = "doc nr: NLD123"
        fields_low = extractor.extract_fields(low_quality)

        # Only comparable when both variants extracted a document number.
        if fields_high.document_number and fields_low.document_number:
            assert fields_high.document_number.confidence >= fields_low.document_number.confidence