Wasim committed on
Commit 2e237ce · Parent: df67c09

Sync: robust vehicle parser + full project

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .dockerignore +5 -0
  2. .gitattributes +8 -8
  3. .github/FUNDING.yml +1 -0
  4. .github/dependabot.yml +17 -0
  5. .github/workflows/push_docker_image.yml +53 -0
  6. .github/workflows/test.yml +49 -0
  7. .gitignore +167 -0
  8. Dockerfile +55 -0
  9. LICENSE +201 -0
  10. Makefile +78 -0
  11. README.md +898 -28
  12. app.py +83 -9
  13. dev-requirements.txt +4 -0
  14. docker-compose-gpu.yml +14 -0
  15. docker-compose.yml +11 -0
  16. extract_pdf_data.py +528 -33
  17. extract_red_text.py +377 -72
  18. fine_tuning_lightgbm_models.ipynb +961 -0
  19. images/vgtexample1.png +3 -0
  20. images/vgtexample2.png +3 -0
  21. images/vgtexample3.png +3 -0
  22. images/vgtexample4.png +3 -0
  23. justfile +95 -0
  24. master_key.py +2 -1
  25. pyproject.toml +39 -0
  26. requirements.txt +27 -0
  27. space-pdf/README.md +910 -0
  28. space-pdf/app.py +124 -0
  29. space-pdf/extract_pdf_data.py +534 -0
  30. space-pdf/extract_red_text.py +764 -0
  31. space-pdf/master_key.py +372 -0
  32. space-pdf/packages.txt +2 -0
  33. space-pdf/requirements.txt +37 -0
  34. space-pdf/update_docx_with_pdf.py +1470 -0
  35. space-pdf/updated_word.py +1189 -0
  36. src/adapters/__init__.py +0 -0
  37. src/adapters/infrastructure/__init__.py +0 -0
  38. src/adapters/infrastructure/format_conversion_service_adapter.py +13 -0
  39. src/adapters/infrastructure/format_converters/__init__.py +0 -0
  40. src/adapters/infrastructure/format_converters/convert_formula_to_latex.py +43 -0
  41. src/adapters/infrastructure/format_converters/convert_table_to_html.py +33 -0
  42. src/adapters/infrastructure/html_conversion_service_adapter.py +23 -0
  43. src/adapters/infrastructure/markdown_conversion_service_adapter.py +23 -0
  44. src/adapters/infrastructure/markup_conversion/ExtractedImage.py +6 -0
  45. src/adapters/infrastructure/markup_conversion/Link.py +8 -0
  46. src/adapters/infrastructure/markup_conversion/OutputFormat.py +6 -0
  47. src/adapters/infrastructure/markup_conversion/__init__.py +0 -0
  48. src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py +361 -0
  49. src/adapters/infrastructure/ocr/__init__.py +0 -0
  50. src/adapters/infrastructure/ocr/languages.py +174 -0
.dockerignore ADDED
@@ -0,0 +1,5 @@
+ /venv/
+ /.venv/
+ .git
+ /detectron2/
+ /images/
.gitattributes CHANGED
@@ -1,8 +1,8 @@
- # Handle Python code and text files
- *.py text eol=lf
- *.md text eol=lf
- *.txt text eol=lf
-
- # Handle binary files
- *.pdf binary
- *.docx binary
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
+ custom: ["https://huridocs.org/donate/"]
.github/dependabot.yml ADDED
@@ -0,0 +1,17 @@
+ version: 2
+ updates:
+   - package-ecosystem: "pip"
+     directory: "/"
+     schedule:
+       interval: "daily"
+     open-pull-requests-limit: 5
+     labels:
+       - "dependencies"
+   - package-ecosystem: "github-actions"
+     directory: "/"
+     schedule:
+       interval: "daily"
+   - package-ecosystem: "docker"
+     directory: "/"
+     schedule:
+       interval: "daily"
.github/workflows/push_docker_image.yml ADDED
@@ -0,0 +1,53 @@
+ name: Create and publish Docker image
+
+ on:
+   push:
+     tags:
+       - 'v*'
+
+ env:
+   REGISTRY: ghcr.io
+   IMAGE_NAME: huridocs/pdf-document-layout-analysis
+
+ jobs:
+   build-and-push-image:
+     runs-on: ubuntu-latest
+     permissions:
+       contents: read
+       packages: write
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+
+       - name: Install dependencies
+         run: sudo apt-get install -y just
+
+       - name: Log in to the Container registry
+         uses: docker/login-action@v3
+         with:
+           registry: ${{ env.REGISTRY }}
+           username: ${{ github.actor }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Extract metadata (tags, labels) for Docker
+         id: meta
+         uses: docker/metadata-action@v5
+         with:
+           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+           tags: |
+             type=ref,event=branch
+             type=ref,event=pr
+             type=semver,pattern={{version}}
+             type=semver,pattern={{major}}.{{minor}}
+
+       - name: Create folder models
+         run: mkdir -p models
+
+       - name: Build and push
+         uses: docker/build-push-action@v6
+         with:
+           context: .
+           file: Dockerfile
+           push: ${{ github.event_name != 'pull_request' }}
+           tags: ${{ steps.meta.outputs.tags }}
+           labels: ${{ steps.meta.outputs.labels }}
.github/workflows/test.yml ADDED
@@ -0,0 +1,49 @@
+ # This workflow will install Python dependencies, run tests and lint with a single version of Python
+ # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+ name: Test
+
+ on:
+   push:
+     branches: [ main ]
+   pull_request:
+     branches: [ main ]
+
+ jobs:
+   build:
+
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+       - name: Set up Python 3.11
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.11'
+
+       - name: Install dependencies
+         run: sudo apt-get update; sudo apt-get install -y pdftohtml qpdf just
+
+       - name: Free up space
+         run: just free_up_space
+
+       - name: Install venv
+         run: just install_venv
+
+       - name: Lint with black
+         run: just check_format
+
+       - name: Start service
+         run: just start_detached
+
+       - name: Check API ready
+         uses: emilioschepis/wait-for-endpoint@v1.0.3
+         with:
+           url: http://localhost:5060
+           method: GET
+           expected-status: 200
+           timeout: 120000
+           interval: 500
+
+       - name: Test with unittest
+         run: just test
.gitignore ADDED
@@ -0,0 +1,167 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+ /models/
162
+ /word_grids/
163
+ /jsons/
164
+ /model_output/
165
+ /pdf_outputs/
166
+ /detectron2/
167
+ /ocr/
Dockerfile ADDED
@@ -0,0 +1,55 @@
+ FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ RUN apt-get update
+ RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc
+
+
+ RUN apt-get install -y ocrmypdf
+ RUN apt-get install -y tesseract-ocr-fra
+ RUN apt-get install -y tesseract-ocr-spa
+ RUN apt-get install -y tesseract-ocr-deu
+ RUN apt-get install -y tesseract-ocr-ara
+ RUN apt-get install -y tesseract-ocr-mya
+ RUN apt-get install -y tesseract-ocr-hin
+ RUN apt-get install -y tesseract-ocr-tam
+ RUN apt-get install -y tesseract-ocr-tha
+ RUN apt-get install -y tesseract-ocr-chi-sim
+ RUN apt-get install -y tesseract-ocr-tur
+ RUN apt-get install -y tesseract-ocr-ukr
+ RUN apt-get install -y tesseract-ocr-ell
+ RUN apt-get install -y tesseract-ocr-rus
+ RUN apt-get install -y tesseract-ocr-kor
+ RUN apt-get install -y tesseract-ocr-kor-vert
+
+
+ RUN mkdir -p /app/src
+ RUN mkdir -p /app/models
+
+ RUN addgroup --system python && adduser --system --group python
+ RUN chown -R python:python /app
+ USER python
+
+ ENV VIRTUAL_ENV=/app/.venv
+ RUN python -m venv $VIRTUAL_ENV
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ COPY requirements.txt requirements.txt
+ RUN uv pip install --upgrade pip
+ RUN uv pip install -r requirements.txt
+
+ WORKDIR /app
+
+ RUN cd src; git clone https://github.com/facebookresearch/detectron2;
+ RUN cd src/detectron2; git checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4; python setup.py build develop
+ RUN uv pip install pycocotools==2.0.8
+
+ COPY ./start.sh ./start.sh
+ COPY ./src/. ./src
+ COPY ./models/. ./models/
+ RUN python src/download_models.py
+
+ ENV PYTHONPATH "${PYTHONPATH}:/app/src"
+ ENV TRANSFORMERS_VERBOSITY=error
+ ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2024-present HURIDOCS
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,78 @@
+ HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)
+
+ install:
+ 	. .venv/bin/activate; pip install -Ur requirements.txt
+
+ activate:
+ 	. .venv/bin/activate
+
+ install_venv:
+ 	python3 -m venv .venv
+ 	. .venv/bin/activate; python -m pip install --upgrade pip
+ 	. .venv/bin/activate; python -m pip install -r dev-requirements.txt
+
+ formatter:
+ 	. .venv/bin/activate; command black --line-length 125 .
+
+ check_format:
+ 	. .venv/bin/activate; command black --line-length 125 . --check
+
+ remove_docker_containers:
+ 	docker compose ps -q | xargs docker rm
+
+ remove_docker_images:
+ 	docker compose config --images | xargs docker rmi
+
+ start:
+ ifeq ($(OS), Windows_NT)
+ 	if not exist models mkdir models
+ else
+ 	mkdir -p ./models
+ endif
+ ifeq ($(HAS_GPU), 1)
+ 	@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
+ 	docker compose -f docker-compose-gpu.yml up --build
+ else
+ 	@echo "No NVIDIA GPU detected, using docker-compose.yml"
+ 	docker compose -f docker-compose.yml up --build
+ endif
+
+
+ start_no_gpu:
+ 	mkdir -p ./models
+ 	docker compose up --build
+
+ stop:
+ 	docker compose stop
+
+ test:
+ 	. .venv/bin/activate; command cd src; command python -m pytest
+
+ free_up_space:
+ 	df -h
+ 	sudo rm -rf /usr/share/dotnet
+ 	sudo rm -rf /opt/ghc
+ 	sudo rm -rf "/usr/local/share/boost"
+ 	sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+ 	sudo apt-get remove -y '^llvm-.*' || true
+ 	sudo apt-get remove -y 'php.*' || true
+ 	sudo apt-get remove -y google-cloud-sdk hhvm google-chrome-stable firefox mono-devel || true
+ 	sudo apt-get autoremove -y
+ 	sudo apt-get clean
+ 	sudo rm -rf /usr/share/dotnet
+ 	sudo rm -rf /usr/local/lib/android
+ 	sudo rm -rf /opt/hostedtoolcache/CodeQL
+ 	sudo docker image prune --all --force
+ 	df -h
+
+
+ start_detached:
+ 	mkdir -p ./models
+ 	docker compose up --build -d
+
+ start_detached_gpu:
+ 	mkdir -p ./models
+ 	RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
+
+ upgrade:
+ 	. .venv/bin/activate; pip-upgrade
README.md CHANGED
@@ -1,40 +1,910 @@
  ---
- title: Audit Report Generator
- emoji: 📝
- colorFrom: purple
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.38.2
- app_file: app.py
- pinned: false
  ---

- # NHVAS Audit Report Generator

- This tool automatically extracts relevant fields from an NHVAS PDF audit summary and populates a Word report template with the extracted data.

- ## Features

- - Upload an NHVAS PDF report
- - Upload your Word `.docx` report template
- - Automatically fills red-text placeholders in the Word document
- - Supports 7 module combinations (Mass, Maintenance, Fatigue, and their permutations)
- - Download the completed report instantly

- ## How to Use

- 1. Upload your **PDF audit report**.
- 2. Upload your **Word template (.docx)** with red-text placeholders.
- 3. Click **Generate Report**.
- 4. Download the updated Word document.

- ## Tech Stack

- - Python 🐍
- - Gradio UI (via Hugging Face Spaces)
- - PyMuPDF (for PDF parsing)
- - python-docx (for Word file editing)

- ## Author

- Built by Shami (Muhammad Ahtesham Ahmad)
1
+ <h1 align="center">PDF Document Layout Analysis</h1>
2
+ <p align="center">A Docker-powered microservice for intelligent PDF document layout analysis, OCR, and content extraction</p>
3
+
4
+ <p align="center">
5
+ <img src="https://img.shields.io/badge/Python-3.10+-blue.svg" alt="Python Version">
6
+ <img src="https://img.shields.io/badge/FastAPI-0.111.1-green.svg" alt="FastAPI">
7
+ <img src="https://img.shields.io/badge/Docker-Ready-blue.svg" alt="Docker">
8
+ <img src="https://img.shields.io/badge/GPU-Supported-orange.svg" alt="GPU Support">
9
+ </p>
10
+
11
+
12
+ <div align="center">
13
+ <p><strong>Built with ❤️ by <a href="https://huridocs.org">HURIDOCS</a></strong></p>
14
+ <p>
15
+ <a href="https://github.com/huridocs/pdf-document-layout-analysis">⭐ Star us on GitHub</a> •
16
+ <a href="https://hub.docker.com/r/huridocs/pdf-document-layout-analysis">🐳 Pull from Docker Hub</a> •
17
+ <a href="https://huggingface.co/HURIDOCS/pdf-document-layout-analysis">🤗 View on Hugging Face</a>
18
+ </p>
19
+ </div>
20
+
21
+
22
+
23
  ---
24
+
25
+ ## 🚀 Overview
26
+
27
+ This project provides a powerful and flexible PDF analysis microservice built with **Clean Architecture** principles. The service enables OCR, segmentation, and classification of different parts of PDF pages, identifying elements such as texts, titles, pictures, tables, formulas, and more. Additionally, it determines the correct reading order of these identified elements and can convert PDFs to various formats including Markdown and HTML.
28
+
29
+ ### ✨ Key Features
30
+
31
+ - 🔍 **Advanced PDF Layout Analysis** - Segment and classify PDF content with high accuracy
32
+ - 🖼️ **Visual & Fast Models** - Choose between VGT (Vision Grid Transformer) for accuracy or LightGBM for speed
33
+ - 📝 **Multi-format Output** - Export to JSON, Markdown, HTML, and visualize PDF segmentations
34
+ - 🌐 **OCR Support** - 150+ language support with Tesseract OCR
35
+ - 📊 **Table & Formula Extraction** - Extract tables as HTML and formulas as LaTeX
36
+ - 🏗️ **Clean Architecture** - Modular, testable, and maintainable codebase
37
+ - 🐳 **Docker-Ready** - Easy deployment with GPU support
38
+ - ⚡ **RESTful API** - Comprehensive API with 10+ endpoints
39
+
40
+ <table>
41
+ <tr>
42
+ <td>
43
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample1.png"/>
44
+ </td>
45
+ <td>
46
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample2.png"/>
47
+ </td>
48
+ <td>
49
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample3.png"/>
50
+ </td>
51
+ <td>
52
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample4.png"/>
53
+ </td>
54
+ </tr>
55
+ </table>
56
+
57
+ ### 🔗 Project Links
58
+
59
+ - **GitHub**: [pdf-document-layout-analysis](https://github.com/huridocs/pdf-document-layout-analysis)
60
+ - **HuggingFace**: [pdf-document-layout-analysis](https://huggingface.co/HURIDOCS/pdf-document-layout-analysis)
61
+ - **DockerHub**: [pdf-document-layout-analysis](https://hub.docker.com/r/huridocs/pdf-document-layout-analysis/)
62
+
63
  ---
64
 
65
+ ## 🚀 Quick Start
66
+
67
+ ### 1. Start the Service
68
+
69
+ **With GPU support (recommended for better performance):**
70
+ ```bash
71
+ make start
72
+ ```
73
+
74
+ **Without GPU support:**
75
+ ```bash
76
+ make start_no_gpu
77
+ ```
78
+
79
+ The service will be available at `http://localhost:5060`
80
+
81
+ **Check service status:**
82
+
83
+ ```bash
84
+ curl http://localhost:5060/info
85
+ ```
86
+
87
+ ### 2. Basic PDF Analysis
88
+
89
+ **Analyze a PDF document (VGT model - high accuracy):**
90
+ ```bash
91
+ curl -X POST -F 'file=@/path/to/your/document.pdf' http://localhost:5060
92
+ ```
93
+
94
+ **Fast analysis (LightGBM models - faster processing):**
95
+ ```bash
96
+ curl -X POST -F 'file=@/path/to/your/document.pdf' -F "fast=true" http://localhost:5060
97
+ ```
98
+
99
+ ### 3. Stop the Service
100
+
101
+ ```bash
102
+ make stop
103
+ ```
104
+
105
+ > 💡 **Tip**: Replace `/path/to/your/document.pdf` with the actual path to your PDF file. The service will return a JSON response with segmented content and metadata.
106
+
107
+
108
+ ## 📋 Table of Contents
109
+
110
+ - [🚀 Quick Start](#-quick-start)
111
+ - [⚙️ Dependencies](#-dependencies)
112
+ - [📋 Requirements](#-requirements)
113
+ - [📚 API Reference](#-api-reference)
114
+ - [💡 Usage Examples](#-usage-examples)
115
+ - [🏗️ Architecture](#-architecture)
116
+ - [🤖 Models](#-models)
117
+ - [📊 Data](#-data)
118
+ - [🔧 Development](#-development)
119
+ - [📈 Benchmarks](#-benchmarks)
120
+ - [Performance](#performance)
121
+ - [Speed](#speed)
122
+ - [🌐 Installation of More Languages for OCR](#-installation-of-more-languages-for-ocr)
123
+ - [🔗 Related Services](#-related-services)
124
+ - [🤝 Contributing](#-contributing)
125
+
126
+
127
+
128
+ ## ⚙️ Dependencies
129
+
130
+ ### Required
131
+ - **Docker Desktop 4.25.0+** - [Installation Guide](https://www.docker.com/products/docker-desktop/)
132
+ - **Python 3.10+** (for local development)
133
+
134
+ ### Optional
135
+ - **NVIDIA Container Toolkit** - [Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (for GPU support)
136
+
137
+ ## 📋 Requirements
138
+
139
+ ### System Requirements
140
+ - **RAM**: 2 GB minimum
141
+ - **GPU Memory**: 5 GB (optional, will fallback to CPU if unavailable)
142
+ - **Disk Space**: 10 GB for models and dependencies
143
+ - **CPU**: Multi-core recommended for better performance
144
+
145
+ ### Docker Requirements
146
+ - Docker Engine 20.10+
147
+ - Docker Compose 2.0+
148
+
149
+ ## 📚 API Reference
150
+
151
+ The service provides a comprehensive RESTful API with the following endpoints:
152
+
153
+ ### Core Analysis Endpoints
154
+
155
+ | Endpoint | Method | Description | Parameters |
156
+ |----------|--------|-------------|------------|
157
+ | `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `parse_tables_and_math` |
158
+ | `/save_xml/{filename}` | POST | Analyze PDF and save XML output | `file`, `xml_file_name`, `fast` |
159
+ | `/get_xml/{filename}` | GET | Retrieve saved XML analysis | `xml_file_name` |
160
+
161
+ ### Content Extraction Endpoints
162
+
163
+ | Endpoint | Method | Description | Parameters |
164
+ |----------|--------|-------------|------------|
165
+ | `/text` | POST | Extract text by content types | `file`, `fast`, `types` |
166
+ | `/toc` | POST | Extract table of contents | `file`, `fast` |
167
+ | `/toc_legacy_uwazi_compatible` | POST | Extract TOC (Uwazi compatible) | `file` |
168
+
169
+ ### Format Conversion Endpoints
170
+
171
+ | Endpoint | Method | Description | Parameters |
172
+ |----------|--------|-------------|------------|
173
+ | `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
174
+ | `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
175
+ | `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
176
+
177
+ ### OCR & Utility Endpoints
178
+
179
+ | Endpoint | Method | Description | Parameters |
180
+ |----------|--------|-------------|------------|
181
+ | `/ocr` | POST | Apply OCR to PDF | `file`, `language` |
182
+ | `/info` | GET | Get service information | - |
183
+ | `/` | GET | Health check and system info | - |
184
+ | `/error` | GET | Test error handling | - |
185
+
186
+ ### Common Parameters
187
+
188
+ - **`file`**: PDF file to process (multipart/form-data)
189
+ - **`fast`**: Use LightGBM models instead of VGT (boolean, default: false)
190
+ - **`parse_tables_and_math`**: Apply OCR to table regions (boolean, default: false) and convert formulas to LaTeX
191
+ - **`language`**: OCR language code (string, default: "en")
192
+ - **`types`**: Comma-separated content types to extract (string, default: "all")
193
+ - **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
194
+ - **`dpi`**: Image resolution for conversion (integer, default: 120)
195
+
196
+ ## 💡 Usage Examples
197
+
198
+ ### Basic PDF Analysis
199
+
200
+ **Standard analysis with VGT model:**
201
+ ```bash
202
+ curl -X POST \
203
+ -F 'file=@document.pdf' \
204
+ http://localhost:5060
205
+ ```
206
+
207
+ **Fast analysis with LightGBM models:**
208
+ ```bash
209
+ curl -X POST \
210
+ -F 'file=@document.pdf' \
211
+ -F 'fast=true' \
212
+ http://localhost:5060
213
+ ```
214
+
215
+ **Analysis with table and math parsing:**
216
+ ```bash
217
+ curl -X POST \
218
+ -F 'file=@document.pdf' \
219
+ -F 'parse_tables_and_math=true' \
220
+ http://localhost:5060
221
+ ```
222
+
223
+ ### Text Extraction
224
+
225
+ **Extract all text:**
226
+ ```bash
227
+ curl -X POST \
228
+ -F 'file=@document.pdf' \
229
+ -F 'types=all' \
230
+ http://localhost:5060/text
231
+ ```
232
+
233
+ **Extract specific content types:**
234
+ ```bash
235
+ curl -X POST \
236
+ -F 'file=@document.pdf' \
237
+ -F 'types=title,text,table' \
238
+ http://localhost:5060/text
239
+ ```
240
+
241
+ ### Format Conversion
242
+
243
+ **Convert to Markdown:**
244
+ ```bash
245
+ curl -X POST http://localhost:5060/markdown \
246
+ -F 'file=@document.pdf' \
247
+ -F 'extract_toc=true' \
248
+ -F 'output_file=document.md' \
249
+ --output 'document.zip'
250
+ ```
251
+
252
+ **Convert to HTML:**
253
+ ```bash
254
+ curl -X POST http://localhost:5060/html \
255
+ -F 'file=@document.pdf' \
256
+ -F 'extract_toc=true' \
257
+ -F 'output_file=document.html' \
258
+ --output 'document.zip'
259
+ ```
260
+
261
+ > **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
262
+ > - **Coordinates**: `left`, `top`, `width`, `height`
263
+ > - **Page information**: `page_number`, `page_width`, `page_height`
264
+ > - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
265
+
266
+
267
+ ### OCR Processing
268
+
269
+ **OCR in English:**
270
+ ```bash
271
+ curl -X POST \
272
+ -F 'file=@scanned_document.pdf' \
273
+ -F 'language=en' \
274
+ http://localhost:5060/ocr \
275
+ --output ocr_processed.pdf
276
+ ```
277
+
278
+ **OCR in other languages:**
279
+ ```bash
280
+ # French
281
+ curl -X POST \
282
+ -F 'file=@document_french.pdf' \
283
+ -F 'language=fr' \
284
+ http://localhost:5060/ocr \
285
+ --output ocr_french.pdf
286
+
287
+ # Spanish
288
+ curl -X POST \
289
+ -F 'file=@document_spanish.pdf' \
290
+ -F 'language=es' \
291
+ http://localhost:5060/ocr \
292
+ --output ocr_spanish.pdf
293
+ ```
294
+
295
+ ### Visualization
296
+
297
+ **Generate visualization PDF:**
298
+ ```bash
299
+ curl -X POST \
300
+ -F 'file=@document.pdf' \
301
+ http://localhost:5060/visualize \
302
+ --output visualization.pdf
303
+ ```
304
+
305
+ ### Table of Contents Extraction
306
+
307
+ **Extract structured TOC:**
308
+ ```bash
309
+ curl -X POST \
310
+ -F 'file=@document.pdf' \
311
+ http://localhost:5060/toc
312
+ ```
313
+
314
+ ### XML Storage and Retrieval
315
+
316
+ **Analyze and save XML:**
317
+ ```bash
318
+ curl -X POST \
319
+ -F 'file=@document.pdf' \
320
+ http://localhost:5060/save_xml/my_analysis
321
+ ```
322
+
323
+ **Retrieve saved XML:**
324
+ ```bash
325
+ curl http://localhost:5060/get_xml/my_analysis.xml
326
+ ```
327
+
328
+ ### Service Information
329
+
330
+ **Get service info and supported languages:**
331
+ ```bash
332
+ curl http://localhost:5060/info
333
+ ```
334
+
335
+ **Health check:**
336
+ ```bash
337
+ curl http://localhost:5060/
338
+ ```
339
+
340
+ ### Response Format
341
+
342
+ Most endpoints return JSON with segment information:
343
+
344
+ ```json
345
+ [
346
+ {
347
+ "left": 72.0,
348
+ "top": 84.0,
349
+ "width": 451.2,
350
+ "height": 23.04,
351
+ "page_number": 1,
352
+ "page_width": 595.32,
353
+ "page_height": 841.92,
354
+ "text": "Document Title",
355
+ "type": "Title"
356
+ },
357
+ {
358
+ "left": 72.0,
359
+ "top": 120.0,
360
+ "width": 451.2,
361
+ "height": 200.0,
362
+ "page_number": 1,
363
+ "page_width": 595.32,
364
+ "page_height": 841.92,
365
+ "text": "This is the main text content...",
366
+ "type": "Text"
367
+ }
368
+ ]
369
+ ```
370
+
371
+ ### Supported Content Types
372
+
373
+ - `Caption` - Image and table captions
374
+ - `Footnote` - Footnote text
375
+ - `Formula` - Mathematical formulas
376
+ - `List item` - List items and bullet points
377
+ - `Page footer` - Footer content
378
+ - `Page header` - Header content
379
+ - `Picture` - Images and figures
380
+ - `Section header` - Section headings
381
+ - `Table` - Table content
382
+ - `Text` - Regular text paragraphs
383
+ - `Title` - Document and section titles
384
+
385
+
386
+ ## 🏗️ Architecture
387
+
388
+ This project follows **Clean Architecture** principles, ensuring separation of concerns, testability, and maintainability. The codebase is organized into distinct layers:
389
+
390
+ ### Directory Structure
391
+
392
+ ```
393
+ src/
394
+ ├── domain/ # Enterprise Business Rules
395
+ │ ├── PdfImages.py # PDF image handling domain logic
396
+ │ ├── PdfSegment.py # PDF segment entity
397
+ │ ├── Prediction.py # ML prediction entity
398
+ │ └── SegmentBox.py # Core segment box entity
399
+ ├── use_cases/ # Application Business Rules
400
+ │ ├── pdf_analysis/ # PDF analysis use case
401
+ │ ├── text_extraction/ # Text extraction use case
402
+ │ ├── toc_extraction/ # Table of contents extraction
403
+ │ ├── visualization/ # PDF visualization use case
404
+ │ ├── ocr/ # OCR processing use case
405
+ │ ├── markdown_conversion/ # Markdown conversion use case
406
+ │ └── html_conversion/ # HTML conversion use case
407
+ ├── adapters/ # Interface Adapters
408
+ │ ├── infrastructure/ # External service adapters
409
+ │ ├── ml/ # Machine learning model adapters
410
+ │ ├── storage/ # File storage adapters
411
+ │ └── web/ # Web framework adapters
412
+ ├── ports/ # Interface definitions
413
+ │ ├── services/ # Service interfaces
414
+ │ └── repositories/ # Repository interfaces
415
+ └── drivers/ # Frameworks & Drivers
416
+ └── web/ # FastAPI application setup
417
+ ```
418
+
419
+ ### Layer Responsibilities
420
+
421
+ - **Domain Layer**: Contains core business entities and rules independent of external concerns
422
+ - **Use Cases Layer**: Orchestrates domain entities to fulfill specific application requirements
423
+ - **Adapters Layer**: Implements interfaces defined by inner layers and adapts external frameworks
424
+ - **Drivers Layer**: Contains frameworks, databases, and external agency configurations
425
+
426
+ ### Key Benefits
427
+
428
+ - 🔄 **Dependency Inversion**: High-level modules don't depend on low-level modules
429
+ - 🧪 **Testability**: Easy to unit test business logic in isolation
430
+ - 🔧 **Maintainability**: Changes to external frameworks don't affect business rules
431
+ - 📈 **Scalability**: Easy to add new features without modifying existing code
432
+
433
+
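To make the layer boundaries more concrete, below is a deliberately simplified, hypothetical sketch of how a domain entity, a port, an adapter, and a use case relate in this kind of structure. The names are illustrative only (apart from `SegmentBox`, whose fields mirror the JSON response shown earlier) and do not correspond line-for-line to the actual source files.

```python
# Hypothetical sketch of the ports-and-adapters layering; class and function
# names are illustrative and are not taken verbatim from the codebase.
from abc import ABC, abstractmethod
from dataclasses import dataclass


@dataclass
class SegmentBox:  # domain entity (src/domain/) — fields mirror the JSON response above
    left: float
    top: float
    width: float
    height: float
    page_number: int
    text: str
    type: str


class LayoutAnalysisPort(ABC):  # interface definition (src/ports/)
    @abstractmethod
    def analyze(self, pdf_path: str) -> list[SegmentBox]: ...


class FastLayoutAnalysisAdapter(LayoutAnalysisPort):  # adapter (src/adapters/)
    def analyze(self, pdf_path: str) -> list[SegmentBox]:
        # A real adapter would run the LightGBM or VGT pipeline here.
        raise NotImplementedError


def extract_titles(analyzer: LayoutAnalysisPort, pdf_path: str) -> list[str]:
    # Use case (src/use_cases/): depends only on the port, never on a concrete adapter.
    return [segment.text for segment in analyzer.analyze(pdf_path) if segment.type == "Title"]
```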
434
+ ## 🤖 Models
435
+
436
+ The service offers two complementary model approaches, each optimized for different use cases:
437
+
438
+ ### 1. Vision Grid Transformer (VGT) - High Accuracy Model
439
+
440
+ **Overview**: A state-of-the-art visual model developed by Alibaba Research Group that "sees" the entire page layout.
441
+
442
+ **Key Features**:
443
+ - 🎯 **High Accuracy**: Best-in-class performance on document layout analysis
444
+ - 👁️ **Visual Understanding**: Analyzes the entire page context including spatial relationships
445
+ - 📊 **Trained on DocLayNet**: Uses the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet)
446
+ - 🔬 **Research-Backed**: Based on [Advanced Literate Machinery](https://github.com/AlibabaResearch/AdvancedLiterateMachinery)
447
+
448
+ **Resource Requirements**:
449
+ - GPU: 5GB+ VRAM (recommended)
450
+ - CPU: Falls back automatically if GPU unavailable
451
+ - Processing Speed: ~1.75 seconds/page (GPU [GTX 1070]) or ~13.5 seconds/page (CPU [i7-8700])
452
+
453
+ ### 2. LightGBM Models - Fast & Efficient
454
+
455
+ **Overview**: Lightweight ensemble of two specialized models using XML-based features from Poppler.
456
+
457
+ **Key Features**:
458
+ - ⚡ **High Speed**: ~0.42 seconds per page on CPU (i7-8700)
459
+ - 💾 **Low Resource Usage**: CPU-only, minimal memory footprint
460
+ - 🔄 **Dual Model Approach**:
461
+ - **Token Type Classifier**: Identifies content types (title, text, table, etc.)
462
+ - **Segmentation Model**: Determines proper content boundaries
463
+ - 📄 **XML-Based**: Uses Poppler's PDF-to-XML conversion for feature extraction
464
+
465
+ **Trade-offs**:
466
+ - Slightly lower accuracy compared to VGT
467
+ - No visual context understanding
468
+ - Excellent for batch processing and resource-constrained environments
469
+
470
+ ### OCR Integration
471
+
472
+ Both models integrate seamlessly with OCR capabilities:
473
+
474
+ - **Engine**: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
475
+ - **Processing**: [ocrmypdf](https://ocrmypdf.readthedocs.io/en/latest/index.html)
476
+ - **Languages**: 150+ supported languages
477
+ - **Output**: Searchable PDFs with preserved layout
478
+
479
+ ### Model Selection Guide
480
+
481
+ | Use Case | Recommended Model | Reason |
482
+ |----------|------------------|---------|
483
+ | High accuracy requirements | VGT | Superior visual understanding |
484
+ | Batch processing | LightGBM | Faster processing, lower resources |
485
+ | GPU available | VGT | Leverages GPU acceleration |
486
+ | CPU-only environment | LightGBM | Optimized for CPU processing |
487
+ | Real-time applications | LightGBM | Consistent fast response times |
488
+ | Research/analysis | VGT | Best accuracy for detailed analysis |
489
+
490
+ ## 📊 Data
491
 
492
+ ### Training Dataset
493
 
494
+ Both model types are trained on the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet), a large-scale document layout analysis dataset containing over 80,000 document pages.
495
 
496
+ ### Document Categories
 
 
 
 
497
 
498
+ The models can identify and classify 11 distinct content types:
499
 
500
+ | ID | Category | Description |
501
+ |----|----------|-------------|
502
+ | 1 | **Caption** | Image and table captions |
503
+ | 2 | **Footnote** | Footnote references and text |
504
+ | 3 | **Formula** | Mathematical equations and formulas |
505
+ | 4 | **List item** | Bulleted and numbered list items |
506
+ | 5 | **Page footer** | Footer content and page numbers |
507
+ | 6 | **Page header** | Header content and titles |
508
+ | 7 | **Picture** | Images, figures, and graphics |
509
+ | 8 | **Section header** | Section and subsection headings |
510
+ | 9 | **Table** | Tabular data and structures |
511
+ | 10 | **Text** | Regular paragraph text |
512
+ | 11 | **Title** | Document and chapter titles |
513
 
514
+ ### Dataset Characteristics
515
 
516
+ - **Domain Coverage**: Academic papers, technical documents, reports
517
+ - **Language**: Primarily English with multilingual support
518
+ - **Quality**: High-quality annotations with bounding boxes and labels
519
+ - **Diversity**: Various document layouts, fonts, and formatting styles
520
+
521
+ For detailed information about the dataset, visit the [DocLayNet repository](https://github.com/DS4SD/DocLayNet).
522
+
523
+ ## 🔧 Development
524
+
525
+ ### Local Development Setup
526
+
527
+ 1. **Clone the repository:**
528
+ ```bash
529
+ git clone https://github.com/huridocs/pdf-document-layout-analysis.git
530
+ cd pdf-document-layout-analysis
531
+ ```
532
+
533
+ 2. **Create virtual environment:**
534
+ ```bash
535
+ make install_venv
536
+ ```
537
+
538
+ 3. **Activate environment:**
539
+ ```bash
540
+ make activate
541
+ # or manually: source .venv/bin/activate
542
+ ```
543
+
544
+ 4. **Install dependencies:**
545
+ ```bash
546
+ make install
547
+ ```
548
+
549
+ ### Code Quality
550
+
551
+ **Format code:**
552
+ ```bash
553
+ make formatter
554
+ ```
555
+
556
+ **Check formatting:**
557
+ ```bash
558
+ make check_format
559
+ ```
560
+
561
+ ### Testing
562
+
563
+ **Run tests:**
564
+ ```bash
565
+ make test
566
+ ```
567
+
568
+ **Integration tests:**
569
+ ```bash
570
+ # Tests are located in src/tests/integration/
571
+ python -m pytest src/tests/integration/test_end_to_end.py
572
+ ```
573
+
574
+ ### Docker Development
575
+
576
+ **Build and start (detached mode):**
577
+ ```bash
578
+ # With GPU
579
+ make start_detached_gpu
580
+
581
+ # Without GPU
582
+ make start_detached
583
+ ```
584
+
585
+ **Clean up Docker resources:**
586
+ ```bash
587
+ # Remove containers
588
+ make remove_docker_containers
589
+
590
+ # Remove images
591
+ make remove_docker_images
592
+ ```
593
+
594
+ ### Project Structure
595
+
596
+ ```
597
+ pdf-document-layout-analysis/
598
+ ├── src/ # Source code
599
+ │ ├── domain/ # Business entities
600
+ │ ├── use_cases/ # Application logic
601
+ │ ├── adapters/ # External integrations
602
+ │ ├── ports/ # Interface definitions
603
+ │ └── drivers/ # Framework configurations
604
+ ├── test_pdfs/ # Test PDF files
605
+ ├── models/ # ML model storage
606
+ ├── docker-compose.yml # Docker configuration
607
+ ├── Dockerfile # Container definition
608
+ ├── Makefile # Development commands
609
+ ├── pyproject.toml # Python project configuration
610
+ └── requirements.txt # Python dependencies
611
+ ```
612
+
613
+ ### Environment Variables
614
+
615
+ Key configuration options:
616
+
617
+ ```bash
618
+ # OCR configuration
619
+ OCR_SOURCE=/tmp/ocr_source
620
+
621
+ # Model paths (auto-configured)
622
+ MODELS_PATH=./models
623
+
624
+ # Service configuration
625
+ HOST=0.0.0.0
626
+ PORT=5060
627
+ ```
628
+
629
+ ### Adding New Features
630
+
631
+ 1. **Domain Logic**: Add entities in `src/domain/`
632
+ 2. **Use Cases**: Implement business logic in `src/use_cases/`
633
+ 3. **Adapters**: Create integrations in `src/adapters/`
634
+ 4. **Ports**: Define interfaces in `src/ports/`
635
+ 5. **Controllers**: Add endpoints in `src/adapters/web/`
636
+
637
+ ### Debugging
638
+
639
+ **View logs:**
640
+ ```bash
641
+ docker compose logs -f
642
+ ```
643
+
644
+ **Access container:**
645
+ ```bash
646
+ docker exec -it pdf-document-layout-analysis /bin/bash
647
+ ```
648
+
649
+ **Free up disk space:**
650
+ ```bash
651
+ make free_up_space
652
+ ```
653
+
654
+ ### Order of Output Elements
655
+
656
+ The service returns SegmentBox elements in a carefully determined reading order:
657
+
658
+ #### Reading Order Algorithm
659
+
660
+ 1. **Poppler Integration**: Uses [Poppler](https://poppler.freedesktop.org) PDF-to-XML conversion to establish initial token reading order
661
+ 2. **Segment Averaging**: Calculates average reading order for multi-token segments
662
+ 3. **Type-Based Sorting**: Prioritizes content types:
663
+ - **Headers** placed first
664
+ - **Main content** in reading order
665
+ - **Footers and footnotes** placed last
666
+
667
+ #### Non-Text Elements
668
+
669
+ For segments without text (e.g., images):
670
+ - Processed after text-based sorting
671
+ - Positioned based on nearest text segment proximity
672
+ - Uses spatial distance as the primary criterion
673
+
674
+ ### Advanced Table and Formula Extraction
675
+
676
+ #### Default Behavior
677
+ - **Formulas**: Automatically extracted as LaTeX format in the `text` property
678
+ - **Tables**: Basic text extraction included by default
679
+
680
+ #### Enhanced Table Extraction
681
+
682
+ Parse tables and extract them in HTML format by setting `parse_tables_and_math=true`:
683
+
684
+ ```bash
685
+ curl -X POST -F 'file=@document.pdf' -F 'parse_tables_and_math=true' http://localhost:5060
686
+ ```
687
+
688
+
689
+ #### Extraction Engines
690
+ - **Formulas**: [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
691
+ - **Tables**: [RapidTable](https://github.com/RapidAI/RapidTable)
692
+
693
+
694
+ ## 📈 Benchmarks
695
+
696
+ ### Performance
697
+
698
+ VGT model performance on PubLayNet dataset:
699
+
700
+ | Metric | Overall | Text | Title | List | Table | Figure |
701
+ |--------|---------|------|-------|------|-------|--------|
702
+ | **F1 Score** | **0.962** | 0.950 | 0.939 | 0.968 | 0.981 | 0.971 |
703
+
704
+ > 📊 **Comparison**: View comprehensive model comparisons at [Papers With Code](https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val)
705
+
706
+ ### Speed
707
+
708
+ Performance benchmarks on 15-page academic documents:
709
+
710
+ | Model | Hardware | Speed (sec/page) | Use Case |
711
+ |-------|----------|------------------|----------|
712
+ | **LightGBM** | CPU (i7-8700 3.2GHz) | **0.42** | Fast processing |
713
+ | **VGT** | GPU (GTX 1070) | **1.75** | High accuracy |
714
+ | **VGT** | CPU (i7-8700 3.2GHz) | 13.5 | CPU fallback |
715
+
716
+ ### Performance Recommendations
717
+
718
+ - **GPU Available**: Use VGT for best accuracy-speed balance
719
+ - **CPU Only**: Use LightGBM for optimal performance
720
+ - **Batch Processing**: LightGBM for consistent throughput
721
+ - **High Accuracy**: VGT with GPU for best results
722
+
723
+
724
+ ## 🌐 Installation of More Languages for OCR
725
+
726
+ The service uses Tesseract OCR with support for 150+ languages. The Docker image includes only common languages to minimize image size.
727
+
728
+ ### Installing Additional Languages
729
+
730
+ #### 1. Access the Container
731
+ ```bash
732
+ docker exec -it --user root pdf-document-layout-analysis /bin/bash
733
+ ```
734
+
735
+ #### 2. Install Language Packs
736
+ ```bash
737
+ # Install specific language
738
+ apt-get update
739
+ apt-get install tesseract-ocr-[LANGCODE]
740
+ ```
741
+
742
+ #### 3. Common Language Examples
743
+
744
+ ```bash
745
+ # Korean
746
+ apt-get install tesseract-ocr-kor
747
+
748
+ # German
749
+ apt-get install tesseract-ocr-deu
750
+
751
+ # French
752
+ apt-get install tesseract-ocr-fra
753
+
754
+ # Spanish
755
+ apt-get install tesseract-ocr-spa
756
+
757
+ # Chinese Simplified
758
+ apt-get install tesseract-ocr-chi-sim
759
+
760
+ # Arabic
761
+ apt-get install tesseract-ocr-ara
762
+
763
+ # Japanese
764
+ apt-get install tesseract-ocr-jpn
765
+ ```
766
+
767
+ #### 4. Verify Installation
768
+
769
+ ```bash
770
+ curl http://localhost:5060/info
771
+ ```
772
+
773
+ ### Language Code Reference
774
+
775
+ Find Tesseract language codes in the [ISO to Tesseract mapping](https://github.com/huridocs/pdf-document-layout-analysis/blob/main/src/adapters/infrastructure/ocr/languages.py).
776
+
777
+ ### Supported Languages
778
+
779
+ Common language codes:
780
+ - `eng` - English
781
+ - `fra` - French
782
+ - `deu` - German
783
+ - `spa` - Spanish
784
+ - `ita` - Italian
785
+ - `por` - Portuguese
786
+ - `rus` - Russian
787
+ - `chi-sim` - Chinese Simplified
788
+ - `chi-tra` - Chinese Traditional
789
+ - `jpn` - Japanese
790
+ - `kor` - Korean
791
+ - `ara` - Arabic
792
+ - `hin` - Hindi
793
+
794
+ ### Usage with Multiple Languages
795
+
796
+ ```bash
797
+ # OCR with specific language
798
+ curl -X POST \
799
+ -F 'file=@document.pdf' \
800
+ -F 'language=fr' \
801
+ http://localhost:5060/ocr \
802
+ --output french_ocr.pdf
803
+ ```
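+
+ The equivalent call from Python (a sketch with the `requests` library; the `/ocr` route and `language` field mirror the curl command above):
+
+ ```python
+ import requests
+
+ # Request a searchable (OCRed) PDF in French and save it to disk.
+ with open("document.pdf", "rb") as pdf:
+     response = requests.post(
+         "http://localhost:5060/ocr",
+         files={"file": pdf},
+         data={"language": "fr"},
+         timeout=600,
+     )
+ response.raise_for_status()
+ with open("french_ocr.pdf", "wb") as out:
+     out.write(response.content)
+ ```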
804
+
805
+
806
+ ## 🔗 Related Services
807
+
808
+ Explore our ecosystem of PDF processing services built on this foundation:
809
+
810
+ ### [PDF Table of Contents Extractor](https://github.com/huridocs/pdf-table-of-contents-extractor)
811
+ 🔍 **Purpose**: Intelligent extraction of structured table of contents from PDF documents
812
+
813
+ **Key Features**:
814
+ - Leverages layout analysis for accurate TOC identification
815
+ - Hierarchical structure recognition
816
+ - Multiple output formats supported
817
+ - Integration-ready API
818
+
819
+ ### [PDF Text Extraction](https://github.com/huridocs/pdf-text-extraction)
820
+ 📝 **Purpose**: Advanced text extraction with layout awareness
821
+
822
+ **Key Features**:
823
+ - Content-type aware extraction
824
+ - Preserves document structure
825
+ - Reading order optimization
826
+ - Clean text output with metadata
827
+
828
+ ### Integration Benefits
829
+
830
+ These services work seamlessly together:
831
+ - **Shared Analysis**: Reuse layout analysis results across services
832
+ - **Consistent Output**: Standardized JSON format for easy integration
833
+ - **Scalable Architecture**: Deploy services independently or together
834
+ - **Docker Ready**: All services containerized for easy deployment
835
+
836
+ ## 🤝 Contributing
837
+
838
+ We welcome contributions to improve the PDF Document Layout Analysis service!
839
+
840
+ ### How to Contribute
841
+
842
+ 1. **Fork the Repository**
843
+ ```bash
844
+ git clone https://github.com/your-username/pdf-document-layout-analysis.git
845
+ ```
846
+
847
+ 2. **Create a Feature Branch**
848
+ ```bash
849
+ git checkout -b feature/your-feature-name
850
+ ```
851
+
852
+ 3. **Set Up Development Environment**
853
+ ```bash
854
+ make install_venv
855
+ make install
856
+ ```
857
+
858
+ 4. **Make Your Changes**
859
+ - Follow the Clean Architecture principles
860
+ - Add tests for new features
861
+ - Update documentation as needed
862
+
863
+ 5. **Run Tests and Quality Checks**
864
+ ```bash
865
+ make test
866
+ make check_format
867
+ ```
868
+
869
+ 6. **Submit a Pull Request**
870
+ - Provide a clear description of the changes
871
+ - Include test results
872
+ - Reference any related issues
873
+
874
+ ### Contribution Guidelines
875
+
876
+ #### Code Standards
877
+ - **Python**: Follow PEP 8 with 125-character line length
878
+ - **Architecture**: Maintain Clean Architecture boundaries
879
+ - **Testing**: Include unit tests for new functionality
880
+ - **Documentation**: Update README and docstrings
881
+
882
+ #### Areas for Contribution
883
+
884
+ - 🐛 **Bug Fixes**: Report and fix issues
885
+ - ✨ **New Features**: Add new endpoints or functionality
886
+ - 📚 **Documentation**: Improve guides and examples
887
+ - 🧪 **Testing**: Expand test coverage
888
+ - 🚀 **Performance**: Optimize processing speed
889
+ - 🌐 **Internationalization**: Add language support
890
+
891
+ #### Development Workflow
892
+
893
+ 1. **Issue First**: Create or comment on relevant issues
894
+ 2. **Small PRs**: Keep pull requests focused and manageable
895
+ 3. **Clean Commits**: Use descriptive commit messages
896
+ 4. **Documentation**: Update relevant documentation
897
+ 5. **Testing**: Ensure all tests pass
898
+
899
+ ### Getting Help
900
+
901
+ - 📚 **Documentation**: Check this README and inline docs
902
+ - 💬 **Issues**: Search existing issues or create new ones
903
+ - 🔍 **Code**: Explore the codebase structure
904
+ - 📧 **Contact**: Reach out to maintainers for guidance
905
+
906
+ ---
907
 
908
+ ### License
909
 
910
+ This project is licensed under the terms specified in the [LICENSE](LICENSE) file.
app.py CHANGED
@@ -3,34 +3,108 @@ import tempfile
3
  import os
4
  import shutil
5
  import subprocess
 
 
 
 
6
 
7
  def process_files(pdf_file, word_file):
8
  # Create a unique temporary directory for this run
9
  temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
 
10
 
11
  # Define standard filenames for use in the pipeline
12
  pdf_path = os.path.join(temp_dir, "input.pdf")
13
  word_path = os.path.join(temp_dir, "input.docx")
14
- pdf_txt_path = os.path.join(temp_dir, "pdf_data.txt")
15
  word_json_path = os.path.join(temp_dir, "word_data.json")
16
  updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
17
  final_docx_path = os.path.join(temp_dir, "updated.docx")
18
 
19
  # Copy the uploaded files to the temp directory
20
  shutil.copy(pdf_file, pdf_path)
 
21
  shutil.copy(word_file, word_path)
 
 
 
 
22
 
23
- # Step 1: Extract text from the PDF
24
- subprocess.run(["python", "extract_pdf_data.py", pdf_path, pdf_txt_path], check=True)
25
 
26
- # Step 2: Extract red text from the Word document
27
- subprocess.run(["python", "extract_red_text.py", word_path, word_json_path], check=True)
28
 
29
- # Step 3: Update the Word JSON using the PDF text (calls OpenAI)
30
- subprocess.run(["python", "update_docx_with_pdf.py", word_json_path, pdf_txt_path, updated_json_path], check=True)
31
 
32
- # Step 4: Apply the updated JSON to the Word doc to create the final output
33
- subprocess.run(["python", "updated_word.py", word_path, updated_json_path, final_docx_path], check=True)
34
 
35
  # Return the final .docx file
36
  return final_docx_path
 
3
  import os
4
  import shutil
5
  import subprocess
6
+ from pathlib import Path
7
+
8
+ SCRIPT_DIR = Path(__file__).resolve().parent
9
+
10
+ def run_cmd(cmd, cwd=None, env=None):
11
+ """Run a command, print nice logs, and also save them to run.log in cwd."""
12
+ cwd = str(cwd or os.getcwd())
13
+ print(f"🟦 Running: {' '.join(cmd)} (cwd={cwd})")
14
+ proc = subprocess.run(
15
+ cmd,
16
+ cwd=cwd,
17
+ env=env,
18
+ capture_output=True,
19
+ text=True
20
+ )
21
+ if proc.stdout:
22
+ print("🟩 STDOUT:")
23
+ print(proc.stdout)
24
+ if proc.stderr:
25
+ print("🟥 STDERR:")
26
+ print(proc.stderr)
27
+ # Save to run.log for debugging
28
+ try:
29
+ runlog = Path(cwd) / "run.log"
30
+ with open(runlog, "a", encoding="utf-8") as f:
31
+ f.write(f"$ {' '.join(cmd)}\n")
32
+ if proc.stdout:
33
+ f.write(proc.stdout + "\n")
34
+ if proc.stderr:
35
+ f.write(proc.stderr + "\n")
36
+ print(f"🧾 Run log saved to: {runlog}")
37
+ except Exception as e:
38
+ print(f"⚠️ Could not write run.log: {e}")
39
+
40
+ if proc.returncode != 0:
41
+ # Let Gradio see the failure so it surfaces properly
42
+ raise subprocess.CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr)
43
+ return proc
44
+
45
+ def _locate_pdf_json(temp_dir: str) -> str:
46
+ """
47
+ Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json.
48
+ Find it (and a few common fallbacks). Raise if not found.
49
+ """
50
+ td = Path(temp_dir)
51
+
52
+ # Prefer exactly-named file if present
53
+ candidates = [
54
+ td / "pdf_data.json", # legacy name (if ever created)
55
+ td / "input_comprehensive_data.json", # most common from your logs
56
+ td / "comprehensive_data.json", # another common alias
57
+ td / "output.json", # generic
58
+ ]
59
+ for p in candidates:
60
+ if p.exists():
61
+ print(f"✅ Using PDF JSON: {p}")
62
+ return str(p)
63
+
64
+ # Generic pattern: anything *_comprehensive_data.json
65
+ globs = list(td.glob("*_comprehensive_data.json"))
66
+ if globs:
67
+ print(f"✅ Using PDF JSON (glob): {globs[0]}")
68
+ return str(globs[0])
69
+
70
+ # If still not found, surface a helpful error
71
+ searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
72
+ raise FileNotFoundError(
73
+ f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
74
+ )
75
 
76
  def process_files(pdf_file, word_file):
77
  # Create a unique temporary directory for this run
78
  temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
79
+ print(f"📂 Temp dir: {temp_dir}")
80
 
81
  # Define standard filenames for use in the pipeline
82
  pdf_path = os.path.join(temp_dir, "input.pdf")
83
  word_path = os.path.join(temp_dir, "input.docx")
 
84
  word_json_path = os.path.join(temp_dir, "word_data.json")
85
  updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
86
  final_docx_path = os.path.join(temp_dir, "updated.docx")
87
 
88
  # Copy the uploaded files to the temp directory
89
  shutil.copy(pdf_file, pdf_path)
90
+ print(f"📄 PDF copied to: {pdf_path}")
91
  shutil.copy(word_file, word_path)
92
+ print(f"📝 DOCX copied to: {word_path}")
93
+
94
+ # 1) PDF → JSON (extractor writes <stem>_comprehensive_data.json into cwd)
95
+ run_cmd(["python", str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)
96
 
97
+ # Find the JSON produced by the extractor
98
+ pdf_json_path = _locate_pdf_json(temp_dir)
99
 
100
+ # 2) DOCX red text JSON
101
+ run_cmd(["python", str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)
102
 
103
+ # 3) Merge JSON (uses the resolved pdf_json_path)
104
+ run_cmd(["python", str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)
105
 
106
+ # 4) Apply updates to DOCX
107
+ run_cmd(["python", str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)
108
 
109
  # Return the final .docx file
110
  return final_docx_path
dev-requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ -r requirements.txt
2
+ pytest==8.2.2
3
+ black==24.4.2
4
+ pip-upgrader==1.4.15
docker-compose-gpu.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ pdf-document-layout-analysis-gpu:
3
+ extends:
4
+ file: docker-compose.yml
5
+ service: pdf-document-layout-analysis
6
+ deploy:
7
+ resources:
8
+ reservations:
9
+ devices:
10
+ - driver: nvidia
11
+ count: 1
12
+ capabilities: [ gpu ]
13
+ environment:
14
+ - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ pdf-document-layout-analysis:
3
+ container_name: pdf-document-layout-analysis
4
+ entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
5
+ init: true
6
+ restart: unless-stopped
7
+ build:
8
+ context: .
9
+ dockerfile: Dockerfile
10
+ ports:
11
+ - "5060:5060"
extract_pdf_data.py CHANGED
@@ -1,39 +1,534 @@
1
- import pdfplumber
2
- from pdf2image import convert_from_path
3
- import pytesseract
4
 
5
- def extract_pdf_full_text(pdf_path, txt_path):
6
- raw_texts = []
7
- need_ocr = []
 
 
 
 
 
 
 
 
8
 
9
- # Step 1: Try to extract RAW text, record which pages need OCR
10
- with pdfplumber.open(pdf_path) as pdf:
11
- for i, page in enumerate(pdf.pages):
12
- print(f"Extracting text from page {i+1}...")
13
- text = page.extract_text() or ""
14
- if text.strip():
15
- raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
16
- else:
17
- raw_texts.append(None)
18
- # Mark that we need OCR for this page
19
- need_ocr.append(i)
20
 
21
- # Step 2: OCR only those pages with no RAW text
22
- print("Running OCR where RAW text is missing...")
23
- images = convert_from_path(pdf_path, dpi=300)
24
- for idx in need_ocr:
25
- ocr_text = pytesseract.image_to_string(images[idx])
26
- raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"
27
 
28
- # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
29
- result = [txt for txt in raw_texts if txt]
30
- with open(txt_path, "w", encoding="utf-8") as f:
31
- f.write("\n".join(result))
32
- print(f"✅ Saved deduped full text to {txt_path}")
33
 
34
  if __name__ == "__main__":
35
- import sys
36
- # Usage: python extract_pdf_data.py input.pdf output.txt
37
- input_pdf = sys.argv[1]
38
- output_txt = sys.argv[2]
39
- extract_pdf_full_text(input_pdf, output_txt)
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fixed PDF Data Extractor - Addresses key issues in comprehensive_extract.py
4
 
5
+ Key fixes:
6
+ 1. Better table extraction and cleaning
7
+ 2. Improved key-value pair extraction
8
+ 3. More robust text processing
9
+ 4. Enhanced vehicle registration extraction
10
+ 5. Better date/number pattern recognition
11
+ """
12
+
13
+ import json
14
+ import re
15
+ import pandas as pd
16
+ from typing import Dict, List, Any, Optional
17
+ import logging
18
+ from pathlib import Path
19
+ import sys
20
+ from datetime import datetime
21
+
22
+ try:
23
+ import pdfplumber
24
+ HAS_PDFPLUMBER = True
25
+ except ImportError:
26
+ HAS_PDFPLUMBER = False
27
+
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger("fixed_pdf_extractor")
30
+
31
+ class FixedPDFExtractor:
32
+ def __init__(self):
33
+ logger.info("🚀 Initializing Fixed PDF Extractor")
34
+
35
+ def extract_everything(self, pdf_path: str) -> Dict[str, Any]:
36
+ if not HAS_PDFPLUMBER:
37
+ raise RuntimeError("pdfplumber is required. Install with: pip install pdfplumber")
38
+
39
+ logger.info(f"📖 Processing PDF: {pdf_path}")
40
+ result = {
41
+ "document_info": {
42
+ "filename": Path(pdf_path).name,
43
+ "total_pages": 0,
44
+ "extraction_timestamp": datetime.now().isoformat()
45
+ },
46
+ "extracted_data": {
47
+ "all_text_content": [],
48
+ "all_tables": [],
49
+ "key_value_pairs": {},
50
+ "audit_information": {},
51
+ "operator_information": {},
52
+ "vehicle_registrations": [],
53
+ "driver_records": [],
54
+ "compliance_summary": {},
55
+ "dates_and_numbers": {}
56
+ }
57
+ }
58
+
59
+ all_text_blocks, all_tables = [], []
60
+
61
+ with pdfplumber.open(pdf_path) as pdf:
62
+ result["document_info"]["total_pages"] = len(pdf.pages)
63
+
64
+ for page_num, page in enumerate(pdf.pages, 1):
65
+ logger.info(f"📄 Processing page {page_num}")
66
+
67
+ # Extract text with better handling
68
+ page_text = self._extract_page_text(page)
69
+ if page_text:
70
+ all_text_blocks.append({
71
+ "page": page_num,
72
+ "text": page_text,
73
+ "word_count": len(page_text.split())
74
+ })
75
+
76
+ # Extract tables with improved cleaning
77
+ tables = self._extract_page_tables(page, page_num)
78
+ all_tables.extend(tables)
79
+
80
+ result["extracted_data"]["all_text_content"] = all_text_blocks
81
+ result["extracted_data"]["all_tables"] = all_tables
82
+
83
+ # Process extracted data with improved methods
84
+ combined_text = "\n\n".join(b["text"] for b in all_text_blocks)
85
+
86
+ result["extracted_data"]["key_value_pairs"] = self._extract_key_value_pairs_improved(combined_text)
87
+ result["extracted_data"]["audit_information"] = self._extract_audit_info(combined_text, all_tables)
88
+ result["extracted_data"]["operator_information"] = self._extract_operator_info(combined_text, all_tables)
89
+ result["extracted_data"]["vehicle_registrations"] = self._extract_vehicle_registrations(all_tables)
90
+ result["extracted_data"]["driver_records"] = self._extract_driver_records(all_tables)
91
+ result["extracted_data"]["compliance_summary"] = self._extract_compliance_summary(combined_text, all_tables)
92
+ result["extracted_data"]["dates_and_numbers"] = self._extract_dates_and_numbers_improved(combined_text)
93
+
94
+ # Generate summary
95
+ result["extraction_summary"] = {
96
+ "text_blocks_found": len(all_text_blocks),
97
+ "tables_found": len(all_tables),
98
+ "key_value_pairs_found": len(result["extracted_data"]["key_value_pairs"]),
99
+ "vehicle_registrations_found": len(result["extracted_data"]["vehicle_registrations"]),
100
+ "driver_records_found": len(result["extracted_data"]["driver_records"]),
101
+ "total_characters": len(combined_text),
102
+ "processing_timestamp": datetime.now().isoformat()
103
+ }
104
+
105
+ logger.info("✅ Extraction completed!")
106
+ return result
107
+
108
+ def _extract_page_text(self, page) -> Optional[str]:
109
+ """Extract text from page with better handling"""
110
+ try:
111
+ text = page.extract_text()
112
+ if text:
113
+ # Clean up text
114
+ text = re.sub(r'[ \t]+', ' ', text.strip())
115
+ text = re.sub(r'\n\s*\n', '\n', text)
116
+ return text
117
+ except Exception as e:
118
+ logger.warning(f"Failed to extract text from page: {e}")
119
+ return None
120
+
121
+ def _extract_page_tables(self, page, page_num: int) -> List[Dict]:
122
+ """Extract tables with improved processing"""
123
+ tables = []
124
+ try:
125
+ raw_tables = page.extract_tables()
126
+ if raw_tables:
127
+ for table_idx, table in enumerate(raw_tables):
128
+ cleaned_table = self._clean_table_improved(table)
129
+ if cleaned_table and len(cleaned_table) > 0:
130
+ tables.append({
131
+ "page": page_num,
132
+ "table_index": table_idx + 1,
133
+ "headers": cleaned_table[0] if cleaned_table else [],
134
+ "data": cleaned_table[1:] if len(cleaned_table) > 1 else [],
135
+ "raw_data": cleaned_table,
136
+ "row_count": len(cleaned_table) - 1 if len(cleaned_table) > 1 else 0,
137
+ "column_count": len(cleaned_table[0]) if cleaned_table else 0
138
+ })
139
+ except Exception as e:
140
+ logger.warning(f"Failed to extract tables from page {page_num}: {e}")
141
+
142
+ return tables
143
+
144
+ def _clean_table_improved(self, table: List[List]) -> List[List[str]]:
145
+ """Improved table cleaning with better cell processing"""
146
+ if not table:
147
+ return []
148
+
149
+ cleaned = []
150
+ for row in table:
151
+ cleaned_row = []
152
+ for cell in row:
153
+ if cell is None:
154
+ cleaned_cell = ""
155
+ else:
156
+ cleaned_cell = str(cell).strip()
157
+ cleaned_cell = re.sub(r'\s+', ' ', cleaned_cell)
158
+ cleaned_cell = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', cleaned_cell)
159
+ cleaned_row.append(cleaned_cell)
160
+ if any(cell.strip() for cell in cleaned_row):
161
+ cleaned.append(cleaned_row)
162
+
163
+ # Optional: collapse single-column tables of empty strings
164
+ if cleaned and all(len(r) == len(cleaned[0]) for r in cleaned):
165
+ return cleaned
166
+ return cleaned
167
+
168
+ def _extract_key_value_pairs_improved(self, text: str) -> Dict[str, str]:
169
+ """Improved key-value pair extraction with better cleaning"""
170
+ pairs: Dict[str, str] = {}
171
+
172
+ # Normalize text a bit for regex stability
173
+ t = text.replace('\r', '\n')
174
+
175
+ # Pattern 1: colon-separated pairs (key: value)
176
+ pattern1 = re.compile(
177
+ r'([A-Za-z][\w\s()/\-.]{2,80}?):\s*([^\n\r:][^\n\r]*)'
178
+ )
179
+ for key, val in pattern1.findall(t):
180
+ k = key.strip()
181
+ v = val.strip()
182
+ # Filter junk: very long values, pure separators, or obvious headers
183
+ if not v or len(v) > 200:
184
+ continue
185
+ if re.fullmatch(r'[-_/\.]+', v):
186
+ continue
187
+ # Avoid capturing the next key as value by trimming trailing key-like tokens
188
+ v = re.sub(r'\s+[A-Z][\w\s()/\-.]{2,40}:$', '', v).strip()
189
+ # Skip values that are just long digit runs (likely id lists without meaning)
190
+ if re.fullmatch(r'\d{6,}', v):
191
+ continue
192
+ pairs[k] = v
193
+
194
+ # Pattern 2: inline “Key – Value” or “Key — Value”
195
+ pattern2 = re.compile(r'([A-Za-z][\w\s()/\-.]{2,80}?)\s*[–—-]\s*([^\n\r]+)')
196
+ for key, val in pattern2.findall(t):
197
+ k = key.strip()
198
+ v = val.strip()
199
+ if v and len(v) <= 200 and not re.fullmatch(r'\d{6,}', v):
200
+ pairs.setdefault(k, v)
201
+
202
+ return pairs
203
+
204
+ def _extract_audit_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
205
+ """Extract audit-specific information with better filtering"""
206
+ audit_info: Dict[str, Any] = {}
207
+
208
+ # Prefer tables
209
+ for table in tables:
210
+ headers = [str(h).lower() for h in table.get("headers", [])]
211
+ joined = ' '.join(headers)
212
+ if "audit information" in joined or "auditinformation" in joined:
213
+ data = table.get("data", [])
214
+ for row in data:
215
+ if len(row) >= 2 and row[0] and row[1]:
216
+ key = str(row[0]).strip()
217
+ value = str(row[1]).strip()
218
+ # Skip numbered list rows (e.g., "1.", "2)")
219
+ if re.match(r'^\s*\d+\s*[.)]\s*$', key):
220
+ continue
221
+ if key and value:
222
+ audit_info[key] = value
223
+
224
+ # Backup from text
225
+ candidates = {
226
+ "Date of Audit": r'Date\s+of\s+Audit[:\s]*([^\n\r]+)',
227
+ "Location of audit": r'Location\s+of\s+audit[:\s]*([^\n\r]+)',
228
+ "Auditor name": r'Auditor\s+name[:\s]*([^\n\r]+)',
229
+ "Audit Matrix Identifier (Name or Number)": r'Audit\s+Matrix\s+Identifier.*?[:\s]*([^\n\r]+)',
230
+ }
231
+ for k, pat in candidates.items():
232
+ if k not in audit_info:
233
+ m = re.search(pat, text, re.IGNORECASE)
234
+ if m:
235
+ audit_info[k] = m.group(1).strip()
236
+
237
+ return audit_info
238
+
239
+ def _extract_operator_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
240
+ """Extract operator information with better table parsing"""
241
+ operator_info: Dict[str, Any] = {}
242
+
243
+ # Look for operator information in tables first
244
+ for table in tables:
245
+ headers = [str(h).lower() for h in table.get("headers", [])]
246
+ if ("operatorinformation" in ' '.join(headers) or
247
+ "operator information" in ' '.join(headers) or
248
+ "operatorcontactdetails" in ' '.join(headers)):
249
+
250
+ data = table.get("data", [])
251
+ for row in data:
252
+ if len(row) >= 2 and row[0] and row[1]:
253
+ key = str(row[0]).strip()
254
+ value = str(row[1]).strip()
255
+ if key and value:
256
+ # Clean up key names
257
+ kl = key.lower()
258
+ if "operator name" in kl:
259
+ operator_info["operator_name"] = value
260
+ elif "trading name" in kl:
261
+ operator_info["trading_name"] = value
262
+ elif "company number" in kl:
263
+ if len(row) > 2:
264
+ company_parts = [str(r).strip() for r in row[1:] if str(r).strip()]
265
+ operator_info["company_number"] = "".join(company_parts)
266
+ else:
267
+ operator_info["company_number"] = value
268
+ elif "business address" in kl:
269
+ operator_info["business_address"] = value
270
+ elif "postal address" in kl:
271
+ operator_info["postal_address"] = value
272
+ elif "email" in kl:
273
+ operator_info["email"] = value
274
+ elif "telephone" in kl or "phone" in kl:
275
+ operator_info["phone"] = value
276
+ elif "nhvas accreditation" in kl:
277
+ operator_info["nhvas_accreditation"] = value
278
+ elif "nhvas manual" in kl:
279
+ operator_info["nhvas_manual"] = value
280
+
281
+ # Extract from text patterns as backup
282
+ patterns = {
283
+ 'operator_name': r'Operator\s*name[:\s\(]*([^\n\r\)]+?)(?=\s*NHVAS|\s*Registered|$)',
284
+ 'trading_name': r'Registered\s*trading\s*name[:\s\/]*([^\n\r]+?)(?=\s*Australian|$)',
285
+ 'company_number': r'Australian\s*Company\s*Number[:\s]*([0-9\s]+?)(?=\s*NHVAS|$)',
286
+ 'business_address': r'Operator\s*business\s*address[:\s]*([^\n\r]+?)(?=\s*Operator\s*Postal|$)',
287
+ 'postal_address': r'Operator\s*Postal\s*address[:\s]*([^\n\r]+?)(?=\s*Email|$)',
288
+ 'email': r'Email\s*address[:\s]*([^\s\n\r]+)',
289
+ 'phone': r'Operator\s*Telephone\s*Number[:\s]*([^\s\n\r]+)',
290
+ 'nhvas_accreditation': r'NHVAS\s*Accreditation\s*No\.[:\s\(]*([^\n\r\)]+)',
291
+ }
292
+
293
+ for key, pattern in patterns.items():
294
+ if key not in operator_info: # Only use text if not found in tables
295
+ match = re.search(pattern, text, re.IGNORECASE)
296
+ if match:
297
+ value = match.group(1).strip()
298
+ if value and len(value) < 200:
299
+ if key == 'company_number':
300
+ value = re.sub(r'\s+', '', value)
301
+ operator_info[key] = value
302
+
303
+ return operator_info
304
+
305
+ def _extract_vehicle_registrations(self, tables: List[Dict]) -> List[Dict]:
306
+ """Extract vehicle registration information from tables"""
307
+ vehicles: List[Dict[str, Any]] = []
308
+
309
+ for table in tables:
310
+ headers = [str(h).lower() for h in table.get("headers", [])]
311
+
312
+ # Look for vehicle registration tables
313
+ if any(keyword in ' '.join(headers) for keyword in ['registration', 'vehicle', 'number']):
314
+ reg_col = None
315
+ for i, header in enumerate(headers):
316
+ if 'registration' in header and 'number' in header:
317
+ reg_col = i
318
+ break
319
+
320
+ if reg_col is not None:
321
+ data = table.get("data", [])
322
+ for row in data:
323
+ if len(row) > reg_col and row[reg_col]:
324
+ reg_num = str(row[reg_col]).strip()
325
+ # Validate registration format (letters/numbers)
326
+ if re.match(r'^[A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3}$', reg_num):
327
+ vehicle_info = {"registration_number": reg_num}
328
+
329
+ # Add other columns as additional info
330
+ for i, header in enumerate(table.get("headers", [])):
331
+ if i < len(row) and i != reg_col:
332
+ vehicle_info[str(header)] = str(row[i]).strip()
333
+
334
+ vehicles.append(vehicle_info)
335
+
336
+ return vehicles
337
+
338
+ def _extract_driver_records(self, tables: List[Dict]) -> List[Dict]:
339
+ """Extract driver records from tables"""
340
+ drivers: List[Dict[str, Any]] = []
341
+
342
+ for table in tables:
343
+ headers = [str(h).lower() for h in table.get("headers", [])]
344
+
345
+ # Look for driver/scheduler tables
346
+ if any(keyword in ' '.join(headers) for keyword in ['driver', 'scheduler', 'name']):
347
+ name_col = None
348
+ for i, header in enumerate(headers):
349
+ if 'name' in header:
350
+ name_col = i
351
+ break
352
+
353
+ if name_col is not None:
354
+ data = table.get("data", [])
355
+ for row in data:
356
+ if len(row) > name_col and row[name_col]:
357
+ name = str(row[name_col]).strip()
358
+ # Basic name validation
359
+ if re.match(r'^[A-Za-z\s]{2,}$', name) and len(name.split()) >= 2:
360
+ driver_info = {"name": name}
361
+
362
+ # Add other columns
363
+ for i, header in enumerate(table.get("headers", [])):
364
+ if i < len(row) and i != name_col:
365
+ driver_info[str(header)] = str(row[i]).strip()
366
+
367
+ drivers.append(driver_info)
368
+
369
+ return drivers
370
+
371
+ def _extract_compliance_summary(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
372
+ """Extract compliance information"""
373
+ compliance = {
374
+ "standards_compliance": {},
375
+ "compliance_codes": {},
376
+ "audit_results": []
377
+ }
378
+
379
+ # Look for compliance tables
380
+ for table in tables:
381
+ headers = [str(h).lower() for h in table.get("headers", [])]
382
+
383
+ if any(keyword in ' '.join(headers) for keyword in ['compliance', 'standard', 'requirement']):
384
+ data = table.get("data", [])
385
+ for row in data:
386
+ if len(row) >= 2:
387
+ standard = str(row[0]).strip()
388
+ code = str(row[1]).strip()
389
+ if standard.startswith('Std') and code in ['V', 'NC', 'SFI', 'NAP', 'NA']:
390
+ compliance["standards_compliance"][standard] = code
391
+
392
+ # Extract compliance codes definitions
393
+ code_patterns = {
394
+ 'V': r'\bV\b\s+([^\n\r]+)',
395
+ 'NC': r'\bNC\b\s+([^\n\r]+)',
396
+ 'SFI': r'\bSFI\b\s+([^\n\r]+)',
397
+ 'NAP': r'\bNAP\b\s+([^\n\r]+)',
398
+ 'NA': r'\bNA\b\s+([^\n\r]+)',
399
+ }
400
+
401
+ for code, pattern in code_patterns.items():
402
+ match = re.search(pattern, text, re.IGNORECASE)
403
+ if match:
404
+ compliance["compliance_codes"][code] = match.group(1).strip()
405
+
406
+ return compliance
407
+
408
+ def _extract_dates_and_numbers_improved(self, text: str) -> Dict[str, Any]:
409
+ """Improved date and number extraction"""
410
+ result = {
411
+ "dates": [],
412
+ "registration_numbers": [],
413
+ "phone_numbers": [],
414
+ "email_addresses": [],
415
+ "reference_numbers": []
416
+ }
417
+
418
+ # Date patterns
419
+ date_patterns = [
420
+ r'\b(\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})\b',
421
+ r'\b(\d{1,2}/\d{1,2}/\d{4})\b',
422
+ r'\b(\d{1,2}-\d{1,2}-\d{4})\b',
423
+ r'\b(\d{1,2}\.\d{1,2}\.\d{4})\b',
424
+ ]
425
+ for pattern in date_patterns:
426
+ result["dates"].extend(re.findall(pattern, text))
427
+
428
+ # Registration numbers (Australian format-ish)
429
+ reg_pattern = r'\b([A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3})\b'
430
+ result["registration_numbers"] = list(set(re.findall(reg_pattern, text)))
431
+
432
+ # Phone numbers (AU)
433
+ phone_pattern = r'\b((?:\+61|0)[2-9]\s?\d{4}\s?\d{4})\b'
434
+ result["phone_numbers"] = list(set(re.findall(phone_pattern, text)))
435
+
436
+ # Email addresses
437
+ email_pattern = r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b'
438
+ result["email_addresses"] = list(set(re.findall(email_pattern, text)))
439
+
440
+ # Reference numbers
441
+ ref_patterns = [
442
+ (r'RF(?:S)?\s*#?\s*(\d+)', 'RFS_Certifications'),
443
+ (r'NHVAS\s+Accreditation\s+No\.?\s*(\d+)', 'NHVAS_Numbers'),
444
+ (r'Registration\s+Number\s*#?\s*(\d+)', 'Registration_Numbers'),
445
+ ]
446
+ for pattern, key in ref_patterns:
447
+ matches = re.findall(pattern, text, re.IGNORECASE)
448
+ if matches:
449
+ result["reference_numbers"].extend([f"{key}: {m}" for m in matches])
450
+
451
+ return result
452
+
453
+ @staticmethod
454
+ def save_results(results: Dict[str, Any], output_path: str):
455
+ """Save results to JSON file"""
456
+ try:
457
+ with open(output_path, 'w', encoding='utf-8') as f:
458
+ json.dump(results, f, indent=2, ensure_ascii=False)
459
+ logger.info(f"💾 Results saved to {output_path}")
460
+ except Exception as e:
461
+ logger.error(f"Failed to save results: {e}")
462
+
463
+ @staticmethod
464
+ def export_to_excel(results: Dict[str, Any], excel_path: str):
465
+ """Export results to Excel with improved formatting"""
466
+ try:
467
+ with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
468
+ # Summary sheet
469
+ summary_data = []
470
+ extraction_summary = results.get("extraction_summary", {})
471
+ for key, value in extraction_summary.items():
472
+ summary_data.append({"Metric": key.replace("_", " ").title(), "Value": value})
473
+ pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
474
+
475
+ # Key-value pairs
476
+ kv_pairs = results.get("extracted_data", {}).get("key_value_pairs", {})
477
+ if kv_pairs:
478
+ kv_df = pd.DataFrame(list(kv_pairs.items()), columns=['Key', 'Value'])
479
+ kv_df.to_excel(writer, sheet_name='Key_Value_Pairs', index=False)
480
+
481
+ # Vehicle registrations
482
+ vehicles = results.get("extracted_data", {}).get("vehicle_registrations", [])
483
+ if vehicles:
484
+ pd.DataFrame(vehicles).to_excel(writer, sheet_name='Vehicle_Registrations', index=False)
485
+
486
+ # Driver records
487
+ drivers = results.get("extracted_data", {}).get("driver_records", [])
488
+ if drivers:
489
+ pd.DataFrame(drivers).to_excel(writer, sheet_name='Driver_Records', index=False)
490
+
491
+ # Compliance summary
492
+ compliance = results.get("extracted_data", {}).get("compliance_summary", {})
493
+ if compliance.get("standards_compliance"):
494
+ comp_df = pd.DataFrame(list(compliance["standards_compliance"].items()),
495
+ columns=['Standard', 'Compliance_Code'])
496
+ comp_df.to_excel(writer, sheet_name='Compliance_Standards', index=False)
497
+
498
+ logger.info(f"📊 Results exported to Excel: {excel_path}")
499
+ except Exception as e:
500
+ logger.error(f"Failed to export to Excel: {e}")
501
+
502
+ def main():
503
+ if len(sys.argv) < 2:
504
+ print("Usage: python fixed_pdf_extractor.py <pdf_path>")
505
+ sys.exit(1)
506
+
507
+ pdf_path = Path(sys.argv[1])
508
+ if not pdf_path.exists():
509
+ print(f"❌ PDF not found: {pdf_path}")
510
+ sys.exit(1)
511
+
512
+ print("🚀 Fixed PDF Data Extractor")
513
+ print("=" * 50)
514
+
515
+ extractor = FixedPDFExtractor()
516
+ results = extractor.extract_everything(str(pdf_path))
517
+
518
+ base = pdf_path.stem
519
+ output_dir = pdf_path.parent
520
 
521
+ # Save outputs
522
+ json_path = output_dir / f"{base}_comprehensive_data.json"
523
+ excel_path = output_dir / f"{base}_fixed_extraction.xlsx"
 
 
 
 
 
 
 
 
524
 
525
+ FixedPDFExtractor.save_results(results, str(json_path))
526
+ FixedPDFExtractor.export_to_excel(results, str(excel_path))
 
 
 
 
527
 
528
+ print("\n💾 OUTPUT FILES:")
529
+ print(f" 📄 JSON Data: {json_path}")
530
+ print(f" 📊 Excel Data: {excel_path}")
531
+ print(f"\n✨ FIXED EXTRACTION COMPLETE!")
 
532
 
533
  if __name__ == "__main__":
534
+ main()
 
 
 
 
extract_red_text.py CHANGED
@@ -6,6 +6,139 @@ from docx import Document
6
  from docx.oxml.ns import qn
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
 
 
 
 
 
 
 
9
  def is_red_font(run):
10
  """Enhanced red font detection with better color checking"""
11
  col = run.font.color
@@ -76,7 +209,6 @@ def calculate_schema_match_score(schema_name, spec, context):
76
  if "Vehicle Registration" in schema_name:
77
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
78
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
79
-
80
  keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
81
  if keyword_matches >= 2:
82
  score += 150 # Very high boost for vehicle tables
@@ -157,15 +289,12 @@ def calculate_schema_match_score(schema_name, spec, context):
157
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
158
  matches = 0
159
  for lbl in labels:
160
- # More flexible matching for vehicle tables
161
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
162
  matches += 1
163
- # Also check for partial keyword matches
164
  elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
165
  matches += 0.5 # Partial credit
166
-
167
  if matches > 0:
168
- score += (matches / len(labels)) * 40 # Higher weight for row1 tables
169
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
170
 
171
  # Special handling for Declaration tables (existing logic)
@@ -187,6 +316,16 @@ def calculate_schema_match_score(schema_name, spec, context):
187
  def match_table_schema(tbl):
188
  """Improved table schema matching with scoring system"""
189
  context = get_table_context(tbl)
 
 
 
 
 
 
 
 
 
 
190
  best_match = None
191
  best_score = 0
192
  for name, spec in TABLE_SCHEMAS.items():
@@ -245,102 +384,256 @@ def extract_multi_schema_table(tbl, schemas):
245
  return result
246
 
247
  def extract_table_data(tbl, schema_name, spec):
248
- """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
249
-
250
- # 🎯 SPECIAL HANDLING for Vehicle Registration tables
 
 
 
 
 
 
 
 
251
  if "Vehicle Registration" in schema_name:
252
  print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
253
-
254
  labels = spec["labels"]
255
- collected = {lbl: [] for lbl in labels}
256
- seen = {lbl: set() for lbl in labels}
257
-
258
- # For Vehicle Registration, orientation is "row1" - headers in first row
 
259
  if len(tbl.rows) < 2:
260
  print(f" ❌ Vehicle table has less than 2 rows")
261
  return {}
262
-
263
- # Map header cells to labels
264
  header_row = tbl.rows[0]
265
  column_mapping = {}
266
-
267
  print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
268
-
269
  for col_idx, cell in enumerate(header_row.cells):
270
- header_text = normalize_text(cell.text).strip()
 
271
  if not header_text:
272
  continue
273
-
274
- print(f" Column {col_idx}: '{header_text}'")
275
-
276
- # Find best matching label
277
- best_match = None
278
- best_score = 0
279
-
280
- for label in labels:
281
- # Direct match
282
- if header_text.upper() == label.upper():
283
- best_match = label
284
- best_score = 1.0
285
- break
286
-
287
- # Partial keyword matching
288
- header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
289
- label_words = set(word.upper() for word in label.split() if len(word) > 2)
290
-
291
- if header_words and label_words:
292
- common_words = header_words.intersection(label_words)
293
- if common_words:
294
- score = len(common_words) / max(len(header_words), len(label_words))
295
- if score > best_score and score >= 0.4: # Lower threshold for vehicle tables
296
- best_score = score
297
- best_match = label
298
-
299
- if best_match:
300
- column_mapping[col_idx] = best_match
301
- print(f" ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
302
  else:
303
- print(f" ⚠️ No mapping found for '{header_text}'")
304
-
 
305
  print(f" 📊 Total column mappings: {len(column_mapping)}")
306
-
307
- # Extract red text from data rows (skip header)
308
  for row_idx in range(1, len(tbl.rows)):
309
  row = tbl.rows[row_idx]
310
  print(f" 📌 Processing data row {row_idx}")
311
-
312
  for col_idx, cell in enumerate(row.cells):
 
 
 
 
 
 
 
 
313
  if col_idx in column_mapping:
314
  label = column_mapping[col_idx]
315
-
316
- # Extract red text
317
- red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
318
-
319
- if red_txt:
320
- print(f" 🔴 Found red text in '{label}': '{red_txt}'")
321
-
322
- if red_txt not in seen[label]:
323
- seen[label].add(red_txt)
324
- collected[label].append(red_txt)
325
-
326
- # Return only non-empty collections
327
  result = {k: v for k, v in collected.items() if v}
 
 
328
  print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
329
  return result
330
-
331
- # 🎯 ORIGINAL CODE for all other tables (unchanged)
 
 
 
 
 
 
 
 
 
332
  labels = spec["labels"] + [schema_name]
333
  collected = {lbl: [] for lbl in labels}
334
  seen = {lbl: set() for lbl in labels}
335
- by_col = (spec["orientation"] == "row1")
336
  start_row = 1 if by_col else 0
337
  rows = tbl.rows[start_row:]
338
-
339
  for ri, row in enumerate(rows):
340
  for ci, cell in enumerate(row.cells):
341
- red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
 
 
 
 
342
  if not red_txt:
343
  continue
 
344
  if by_col:
345
  if ci < len(spec["labels"]):
346
  lbl = spec["labels"][ci]
@@ -354,17 +647,19 @@ def extract_table_data(tbl, schema_name, spec):
354
  lbl = spec_label
355
  break
356
  if not lbl:
 
357
  for spec_label in spec["labels"]:
358
- spec_norm = normalize_text(spec_label).upper()
359
- raw_norm = raw_label.upper()
360
- if spec_norm in raw_norm or raw_norm in spec_norm:
361
  lbl = spec_label
362
  break
363
  if not lbl:
364
  lbl = schema_name
 
365
  if red_txt not in seen[lbl]:
366
  seen[lbl].add(red_txt)
367
  collected[lbl].append(red_txt)
 
368
  return {k: v for k, v in collected.items() if v}
369
 
370
  def extract_red_text(input_doc):
@@ -405,6 +700,8 @@ def extract_red_text(input_doc):
405
  out[schema][k] = v
406
  else:
407
  out[schema] = data
 
 
408
  paras = {}
409
  for idx, para in enumerate(doc.paragraphs):
410
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
@@ -423,8 +720,16 @@ def extract_red_text(input_doc):
423
  if not context:
424
  context = "(para)"
425
  paras.setdefault(context, []).append(red_txt)
 
426
  if paras:
427
  out["paragraphs"] = paras
 
 
 
 
 
 
 
428
  return out
429
 
430
  def extract_red_text_filelike(input_file, output_file):
 
6
  from docx.oxml.ns import qn
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
9
+ def normalize_header_label(s: str) -> str:
10
+ """Normalize a header/label by stripping parentheticals & punctuation."""
11
+ s = re.sub(r"\s+", " ", s.strip())
12
+ # remove content in parentheses/brackets
13
+ s = re.sub(r"\([^)]*\)", "", s)
14
+ s = re.sub(r"\[[^]]*\]", "", s)
15
+ # unify slashes and hyphens, collapse spaces
16
+ s = s.replace("–", "-").replace("—", "-").replace("/", " / ").replace("\u00a0", " ")  # treat non-breaking spaces as normal spaces
17
+ return s.strip()
18
+
19
+ # Canonical label aliases for Vehicle/Maintenance/General headers
20
+ LABEL_ALIASES = {
21
+ # Vehicle Registration (Maintenance)
22
+ "roadworthiness certificates": "Roadworthiness Certificates",
23
+ "maintenance records": "Maintenance Records",
24
+ "daily checks": "Daily Checks",
25
+ "fault recording / reporting": "Fault Recording/ Reporting",
26
+ "fault repair": "Fault Repair",
27
+
28
+ # Vehicle Registration (Mass)
29
+ "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance",
30
+ "weight verification records": "Weight Verification Records",
31
+ "rfs suspension certification #": "RFS Suspension Certification #",
32
+ "suspension system maintenance": "Suspension System Maintenance",
33
+ "trip records": "Trip Records",
34
+ "fault recording/ reporting on suspension system": "Fault Recording/ Reporting on Suspension System",
35
+
36
+ # Common
37
+ "registration number": "Registration Number",
38
+ "no.": "No.",
39
+ "sub contractor": "Sub contractor",
40
+ "sub-contractor": "Sub contractor",
41
+ }
42
+
43
+ def looks_like_operator_declaration(context):
44
+ """True iff heading says Operator Declaration and headers include Print Name + Position Title."""
45
+ heading = (context.get("heading") or "").strip().lower()
46
+ headers = " ".join(context.get("headers") or []).lower()
47
+ return (
48
+ "operator declaration" in heading
49
+ and "print name" in headers
50
+ and "position" in headers
51
+ and "title" in headers
52
+ )
53
+
54
+ def looks_like_auditor_declaration(context):
55
+ heading = (context.get("heading") or "").strip().lower()
56
+ headers = " ".join(context.get("headers") or []).lower()
57
+ return (
58
+ "auditor declaration" in heading
59
+ and "print name" in headers
60
+ and ("nhvr" in headers or "auditor registration number" in headers)
61
+ )
62
+
63
+ # --- NEW: header-only fallback that ignores headings and just keys on the two column names
64
+ def extract_operator_declaration_by_headers_from_end(doc):
65
+ """
66
+ Scan tables from the end; if a table's first row contains both
67
+ 'Print Name' AND 'Position Title' (case-insensitive), extract red text
68
+ from the data rows into:
69
+ {"Print Name": [...], "Position Title": [...]}
70
+ """
71
+ for tbl in reversed(doc.tables):
72
+ if len(tbl.rows) < 2:
73
+ continue # need header + at least one data row
74
+
75
+ headers_norm = [normalize_header_label(c.text).lower() for c in tbl.rows[0].cells]
76
+ has_print = any("print name" in h for h in headers_norm)
77
+ has_pos_tit = any(("position title" in h) or ("position" in h and "title" in h) for h in headers_norm)
78
+ if not (has_print and has_pos_tit):
79
+ continue
80
+
81
+ idx_print = next((i for i, h in enumerate(headers_norm) if "print name" in h), None)
82
+ idx_pos = next((i for i, h in enumerate(headers_norm) if "position title" in h), None)
83
+ if idx_pos is None:
84
+ idx_pos = next((i for i, h in enumerate(headers_norm) if ("position" in h and "title" in h)), None)
85
+
86
+ result = {"Print Name": [], "Position Title": []}
87
+ for row in tbl.rows[1:]:
88
+ if idx_print is not None and idx_print < len(row.cells):
89
+ cell = row.cells[idx_print]
90
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
91
+ reds = coalesce_numeric_runs(reds)
92
+ txt = normalize_text(" ".join(reds))
93
+ if txt:
94
+ result["Print Name"].append(txt)
95
+
96
+ if idx_pos is not None and idx_pos < len(row.cells):
97
+ cell = row.cells[idx_pos]
98
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
99
+ reds = coalesce_numeric_runs(reds)
100
+ txt = normalize_text(" ".join(reds))
101
+ if txt:
102
+ result["Position Title"].append(txt)
103
+
104
+ if result["Print Name"] or result["Position Title"]:
105
+ return {k: v for k, v in result.items() if v}
106
+
107
+ return None
108
+ # --- end NEW helper
109
+
110
+ def canonicalize_label(s: str) -> str:
111
+ key = normalize_header_label(s).lower()
112
+ key = re.sub(r"\s+", " ", key)
113
+ return LABEL_ALIASES.get(key, s)
114
+
115
+ def bag_similarity(a: str, b: str) -> float:
116
+ """Loose bag-of-words similarity for header↔label matching."""
117
+ aw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(a).lower()) if len(w) > 2 or w in {"#","no"}}
118
+ bw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(b).lower()) if len(w) > 2 or w in {"#","no"}}
119
+ if not aw or not bw:
120
+ return 0.0
121
+ inter = len(aw & bw)
122
+ return inter / max(len(aw), len(bw))
123
+
124
+ def coalesce_numeric_runs(text_list):
125
+ """
126
+ If a cell yields ['4','5','6','9','8','7','1','2','3'] etc., join continuous single-char digit runs.
127
+ Returns ['456987123'] instead of many singles. Non-digit tokens are preserved.
128
+ """
129
+ out, buf = [], []
130
+ for t in text_list:
131
+ if len(t) == 1 and t.isdigit():
132
+ buf.append(t)
133
+ else:
134
+ if buf:
135
+ out.append("".join(buf))
136
+ buf = []
137
+ out.append(t)
138
+ if buf:
139
+ out.append("".join(buf))
140
+ return out
141
+
142
  def is_red_font(run):
143
  """Enhanced red font detection with better color checking"""
144
  col = run.font.color
 
209
  if "Vehicle Registration" in schema_name:
210
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
211
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
 
212
  keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
213
  if keyword_matches >= 2:
214
  score += 150 # Very high boost for vehicle tables
 
289
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
290
  matches = 0
291
  for lbl in labels:
 
292
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
293
  matches += 1
 
294
  elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
295
  matches += 0.5 # Partial credit
 
296
  if matches > 0:
297
+ score += (matches / len(labels)) * 40
298
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
299
 
300
  # Special handling for Declaration tables (existing logic)
 
316
  def match_table_schema(tbl):
317
  """Improved table schema matching with scoring system"""
318
  context = get_table_context(tbl)
319
+ # Auditor Declaration first
320
+ if ("print name" in " ".join(context.get("headers", [])).lower() and
321
+ "auditor" in " ".join(context.get("headers", [])).lower()):
322
+ return "NHVAS Approved Auditor Declaration"
323
+ # NEW: prioritize Auditor Declaration to avoid misclassification
324
+ if looks_like_auditor_declaration(context):
325
+ return "NHVAS Approved Auditor Declaration"
326
+ # hard-match Operator Declaration first (high priority, avoids misclassification)
327
+ if looks_like_operator_declaration(context):
328
+ return "Operator Declaration"
329
  best_match = None
330
  best_score = 0
331
  for name, spec in TABLE_SCHEMAS.items():
 
384
  return result
385
 
386
  def extract_table_data(tbl, schema_name, spec):
387
+ """Extract red text data from table based on schema – per-row repeats for specific tables."""
388
+
389
+ # ───────────────────────────────────────────────────────────────────────────
390
+ # OPERATOR DECLARATION (row1 headers: Print Name | Position Title)
391
+ # ───────────────────────────────────────────────────────────────────────────
392
+ if schema_name == "Operator Declaration":
393
+ print(f" 🧾 EXTRACTION FIX: Processing Operator Declaration table")
394
+
395
+ labels = spec["labels"] # ["Print Name", "Position Title"]
396
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
397
+
398
+ collected = {lbl: [] for lbl in labels}
399
+
400
+ if len(tbl.rows) < 2:
401
+ print(f" ❌ Operator Declaration table has less than 2 rows")
402
+ return {}
403
+
404
+ # map header cells → labels (row1 orientation)
405
+ header_row = tbl.rows[0]
406
+ column_mapping = {}
407
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
408
+
409
+ for col_idx, cell in enumerate(header_row.cells):
410
+ raw_h = normalize_text(cell.text)
411
+ header_text = normalize_header_label(raw_h)
412
+ if not header_text:
413
+ continue
414
+ print(f" Column {col_idx}: '{raw_h}'")
415
+
416
+ # alias/canonical first
417
+ canon = canonicalize_label(header_text)
418
+ if canon in canonical_labels:
419
+ best_label = canonical_labels[canon]
420
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
421
+ column_mapping[col_idx] = best_label
422
+ continue
423
+
424
+ # else bag-of-words similarity
425
+ best_label, best_score = None, 0.0
426
+ for canon_lab, original_lab in canonical_labels.items():
427
+ s = bag_similarity(header_text, canon_lab)
428
+ if s > best_score:
429
+ best_score, best_label = s, original_lab
430
+
431
+ if best_label and best_score >= 0.40:
432
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
433
+ column_mapping[col_idx] = best_label
434
+ else:
435
+ print(f" ⚠️ No mapping found for '{raw_h}'")
436
+
437
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
438
+
439
+ # collect red text from the (usually single) data row
440
+ for row_idx in range(1, len(tbl.rows)):
441
+ row = tbl.rows[row_idx]
442
+ print(f" 📌 Processing data row {row_idx}")
443
+ for col_idx, cell in enumerate(row.cells):
444
+ if col_idx not in column_mapping:
445
+ continue
446
+ label = column_mapping[col_idx]
447
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
448
+ if not reds:
449
+ continue
450
+ reds = coalesce_numeric_runs(reds)
451
+ red_txt = normalize_text(" ".join(reds))
452
+ if not red_txt:
453
+ continue
454
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
455
+ collected[label].append(red_txt)
456
+
457
+ result = {k: v for k, v in collected.items() if v}
458
+ print(f" ✅ Operator Declaration extracted: {len(result)} columns with data")
459
+ return result
460
+
461
+ # ───────────────────────────────────────────────────────────────────────────
462
+ # A) Vehicle Registration tables (per-row accumulation; NO dedupe)
463
+ # ───────────────────────────────────────────────────────────────────────────
464
  if "Vehicle Registration" in schema_name:
465
  print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
466
+
467
  labels = spec["labels"]
468
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
469
+
470
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
471
+ unmapped_bucket = {}
472
+
473
  if len(tbl.rows) < 2:
474
  print(f" ❌ Vehicle table has less than 2 rows")
475
  return {}
476
+
 
477
  header_row = tbl.rows[0]
478
  column_mapping = {}
 
479
  print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
480
+
481
  for col_idx, cell in enumerate(header_row.cells):
482
+ raw_h = normalize_text(cell.text)
483
+ header_text = normalize_header_label(raw_h)
484
  if not header_text:
485
  continue
486
+ print(f" Column {col_idx}: '{raw_h}'")
487
+
488
+ # Try alias/canonical first
489
+ canon = canonicalize_label(header_text)
490
+ if canon in canonical_labels:
491
+ best_label = canonical_labels[canon]
492
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
493
+ column_mapping[col_idx] = best_label
494
+ continue
495
+
496
+ # Else bag-of-words similarity
497
+ best_label, best_score = None, 0.0
498
+ for canon_lab, original_lab in canonical_labels.items():
499
+ s = bag_similarity(header_text, canon_lab)
500
+ if s > best_score:
501
+ best_score, best_label = s, original_lab
502
+
503
+ if best_label and best_score >= 0.40:
504
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
505
+ column_mapping[col_idx] = best_label
 
 
 
 
 
 
 
 
 
506
  else:
507
+ print(f" ⚠️ No mapping found for '{raw_h}'")
508
+ unmapped_bucket[raw_h] = []
509
+
510
  print(f" 📊 Total column mappings: {len(column_mapping)}")
511
+
512
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
513
  for row_idx in range(1, len(tbl.rows)):
514
  row = tbl.rows[row_idx]
515
  print(f" 📌 Processing data row {row_idx}")
 
516
  for col_idx, cell in enumerate(row.cells):
517
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
518
+ if not reds:
519
+ continue
520
+ reds = coalesce_numeric_runs(reds)
521
+ red_txt = normalize_text(" ".join(reds))
522
+ if not red_txt:
523
+ continue
524
+
525
  if col_idx in column_mapping:
526
  label = column_mapping[col_idx]
527
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
528
+ collected[label].append(red_txt) # append every occurrence
529
+ else:
530
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
531
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
532
+
 
 
 
 
 
 
533
  result = {k: v for k, v in collected.items() if v}
534
+ if unmapped_bucket:
535
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
536
  print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
537
  return result
538
+
539
+ # ───────────────────────────────────────────────────────────────────────────
540
+ # B) Driver / Scheduler Records Examined (per-row accumulation; NO dedupe)
541
+ # ───────────────────────────────────────────────────────────────────────────
542
+ if "Driver / Scheduler" in schema_name:
543
+ print(f" 👤 EXTRACTION FIX: Processing Driver / Scheduler table")
544
+
545
+ labels = spec["labels"]
546
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
547
+
548
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
549
+ unmapped_bucket = {}
550
+
551
+ if len(tbl.rows) < 2:
552
+ print(f" ❌ Driver/Scheduler table has less than 2 rows")
553
+ return {}
554
+
555
+ header_row = tbl.rows[0]
556
+ column_mapping = {}
557
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
558
+
559
+ for col_idx, cell in enumerate(header_row.cells):
560
+ raw_h = normalize_text(cell.text)
561
+ header_text = normalize_header_label(raw_h)
562
+ if not header_text:
563
+ continue
564
+ print(f" Column {col_idx}: '{raw_h}'")
565
+
566
+ # Try alias/canonical first (rarely used here, but safe)
567
+ canon = canonicalize_label(header_text)
568
+ if canon in canonical_labels:
569
+ best_label = canonical_labels[canon]
570
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
571
+ column_mapping[col_idx] = best_label
572
+ continue
573
+
574
+ # Else bag-of-words similarity (good for long headings)
575
+ best_label, best_score = None, 0.0
576
+ for canon_lab, original_lab in canonical_labels.items():
577
+ s = bag_similarity(header_text, canon_lab)
578
+ if s > best_score:
579
+ best_score, best_label = s, original_lab
580
+
581
+ if best_label and best_score >= 0.40:
582
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
583
+ column_mapping[col_idx] = best_label
584
+ else:
585
+ print(f" ⚠️ No mapping found for '{raw_h}'")
586
+ unmapped_bucket[raw_h] = []
587
+
588
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
589
+
590
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
591
+ for row_idx in range(1, len(tbl.rows)):
592
+ row = tbl.rows[row_idx]
593
+ print(f" 📌 Processing data row {row_idx}")
594
+ for col_idx, cell in enumerate(row.cells):
595
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
596
+ if not reds:
597
+ continue
598
+ reds = coalesce_numeric_runs(reds)
599
+ red_txt = normalize_text(" ".join(reds))
600
+ if not red_txt:
601
+ continue
602
+
603
+ if col_idx in column_mapping:
604
+ label = column_mapping[col_idx]
605
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
606
+ collected[label].append(red_txt) # ← append every occurrence
607
+ else:
608
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
609
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
610
+
611
+ result = {k: v for k, v in collected.items() if v}
612
+ if unmapped_bucket:
613
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
614
+ print(f" ✅ Driver / Scheduler extracted: {len(result)} columns with data")
615
+ return result
616
+
617
+ # ───────────────────────────────────────────────────────────────────────────
618
+ # C) Generic tables (unchanged: WITH dedupe)
619
+ # ───────────────────────────────────────────────────────────────────────────
620
  labels = spec["labels"] + [schema_name]
621
  collected = {lbl: [] for lbl in labels}
622
  seen = {lbl: set() for lbl in labels}
623
+ by_col = (spec.get("orientation") == "row1")
624
  start_row = 1 if by_col else 0
625
  rows = tbl.rows[start_row:]
626
+
627
  for ri, row in enumerate(rows):
628
  for ci, cell in enumerate(row.cells):
629
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
630
+ if not reds:
631
+ continue
632
+ reds = coalesce_numeric_runs(reds)
633
+ red_txt = normalize_text(" ".join(reds))
634
  if not red_txt:
635
  continue
636
+
637
  if by_col:
638
  if ci < len(spec["labels"]):
639
  lbl = spec["labels"][ci]
 
647
  lbl = spec_label
648
  break
649
  if not lbl:
650
+ a_raw = normalize_header_label(raw_label).upper()
651
  for spec_label in spec["labels"]:
652
+ a_spec = normalize_header_label(spec_label).upper()
653
+ if a_spec in a_raw or a_raw in a_spec:
 
654
  lbl = spec_label
655
  break
656
  if not lbl:
657
  lbl = schema_name
658
+
659
  if red_txt not in seen[lbl]:
660
  seen[lbl].add(red_txt)
661
  collected[lbl].append(red_txt)
662
+
663
  return {k: v for k, v in collected.items() if v}
664
 
665
  def extract_red_text(input_doc):
 
700
  out[schema][k] = v
701
  else:
702
  out[schema] = data
703
+
704
+ # paragraphs (FIX: do not return early; build full 'paras' then attach)
705
  paras = {}
706
  for idx, para in enumerate(doc.paragraphs):
707
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
 
720
  if not context:
721
  context = "(para)"
722
  paras.setdefault(context, []).append(red_txt)
723
+
724
  if paras:
725
  out["paragraphs"] = paras
726
+
727
+ # Fallback: ensure we capture the last-page Operator Declaration by headers
728
+ if "Operator Declaration" not in out:
729
+ op_dec = extract_operator_declaration_by_headers_from_end(doc)
730
+ if op_dec:
731
+ out["Operator Declaration"] = op_dec
732
+
733
  return out
734
 
735
  def extract_red_text_filelike(input_file, output_file):
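The two specialized branches above keep every row value (per-row accumulation), while the generic branch deduplicates through a per-label `seen` set. A minimal sketch of the two collection strategies, using plain dicts of already-extracted red-text strings rather than the real python-docx objects (the sample data is hypothetical):

```python
# Minimal sketch, not the project's actual extractor: contrasts the two
# collection strategies used above on pre-extracted red-text values.
rows = [
    {"Registration number": "ABC-123", "Sub-contractor": "Yes"},
    {"Registration number": "ABC-123", "Sub-contractor": "No"},
]

def collect_every_occurrence(rows: list[dict]) -> dict:
    """Vehicle Registration / Driver-Scheduler style: append every row value, duplicates included."""
    collected: dict[str, list[str]] = {}
    for row in rows:
        for label, value in row.items():
            collected.setdefault(label, []).append(value)
    return collected

def collect_deduplicated(rows: list[dict]) -> dict:
    """Generic-table style: a per-label 'seen' set drops repeated values."""
    collected: dict[str, list[str]] = {}
    seen: dict[str, set[str]] = {}
    for row in rows:
        for label, value in row.items():
            if value not in seen.setdefault(label, set()):
                seen[label].add(value)
                collected.setdefault(label, []).append(value)
    return collected

print(collect_every_occurrence(rows))  # 'ABC-123' listed twice
print(collect_deduplicated(rows))      # 'ABC-123' listed once
```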
fine_tuning_lightgbm_models.ipynb ADDED
@@ -0,0 +1,961 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d15dad13-9732-4e4c-bbd1-1a33545a4293",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Overview"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "6f857fc7-d7fb-4b05-a242-de31fb1f086d",
14
+ "metadata": {},
15
+ "source": [
16
+ "In this notebook, we'll go through the process of fine-tuning the LightGBM models in the `pdf-document-layout-analysis` service."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "id": "0c96b645-eef0-47a2-8c4f-284cdc05e76d",
22
+ "metadata": {},
23
+ "source": [
24
+ "But before doing that, let's start with some basic concepts and introduce modules and methods to make the process easier and cleaner."
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "id": "f1e5c19b-1920-4f2c-9994-943626cd8a58",
30
+ "metadata": {},
31
+ "source": [
32
+ "To begin with, you should first ensure that `Poppler` is installed on your system. We will use it to process PDFs:"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 1,
38
+ "id": "5f198930-caf1-4cb4-bb1e-8ca063ad8587",
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "pdftohtml is already installed.\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "%%bash\n",
51
+ "\n",
52
+ "if ! command -v pdftohtml &> /dev/null\n",
53
+ "then\n",
54
+ " echo \"pdftohtml is not installed. Installing now...\"\n",
55
+ " sudo apt install pdftohtml\n",
56
+ "else\n",
57
+ " echo \"pdftohtml is already installed.\"\n",
58
+ "fi"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "id": "5d971faa-e9a8-47d6-8c02-66be6f3a3c6c",
64
+ "metadata": {},
65
+ "source": [
66
+ "We use Poppler to convert PDFs to XMLs. To work with Poppler in Python, we have created the `PdfFeatures` module, which can be found in `pdf_features/PdfFeatures.py`."
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "id": "f7ac5d42-fb70-4476-8e05-b159f18ae3dd",
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "from pdf_features.PdfFeatures import PdfFeatures"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "e45522eb-6879-472a-a822-64b38041ccc3",
82
+ "metadata": {},
83
+ "source": [
84
+ "To open a PDF file with the PdfFeatures module, simply write:"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 3,
90
+ "id": "e4ac53e5-b249-4dcd-beeb-e3009e17079b",
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "Page-1\n",
98
+ "Page-2\n"
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "id": "2c7c6241-9016-4416-a53e-644145f9063a",
109
+ "metadata": {},
110
+ "source": [
111
+ "When you open `pdf_features` like this, the XML file is saved in a temporary path and handled on the fly.\n",
112
+ "\n",
113
+ "If you want to save the XML file, you should provide a path where it can be saved:"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 4,
119
+ "id": "eb1056ee-2e45-4b12-b2bc-8d23553c2143",
120
+ "metadata": {},
121
+ "outputs": [
122
+ {
123
+ "name": "stdout",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "Page-1\n",
127
+ "Page-2\n"
128
+ ]
129
+ }
130
+ ],
131
+ "source": [
132
+ "pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\", \"test_pdfs/regular.xml\")"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "markdown",
137
+ "id": "703ec555-c3a5-4e7e-a6dd-886be67cb6de",
138
+ "metadata": {},
139
+ "source": [
140
+ "Here is a part of the XML to illustrate what it looks like:"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "markdown",
145
+ "id": "5b6fcebd-f91b-43fe-b2d6-b9956c3fd173",
146
+ "metadata": {},
147
+ "source": [
148
+ "```\n",
149
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
150
+ "<!DOCTYPE pdf2xml SYSTEM \"pdf2xml.dtd\">\n",
151
+ "\n",
152
+ "<pdf2xml producer=\"poppler\" version=\"23.04.0\">\n",
153
+ "<page number=\"1\" position=\"absolute\" top=\"0\" left=\"0\" height=\"842\" width=\"595\">\n",
154
+ "\t<fontspec id=\"0\" size=\"10\" family=\"JOIBEJ+Verdana\" color=\"#000000\"/>\n",
155
+ "\t<fontspec id=\"1\" size=\"10\" family=\"JOIBGK+Verdana\" color=\"#000000\"/>\n",
156
+ "<text top=\"106\" left=\"244\" width=\"111\" height=\"12\" font=\"0\"><b>RESOLUCIÓN DE LA </b></text>\n",
157
+ "<text top=\"118\" left=\"157\" width=\"284\" height=\"12\" font=\"0\"><b>CORTE INTERAMERICANA DE DERECHOS HUMANOS </b></text>\n",
158
+ "<text top=\"129\" left=\"227\" width=\"145\" height=\"12\" font=\"0\"><b>DEL 29 DE JULIO DE 1991 </b></text>\n",
159
+ "<text top=\"141\" left=\"298\" width=\"3\" height=\"12\" font=\"0\"><b> </b></text>\n",
160
+ "<text top=\"153\" left=\"298\" width=\"3\" height=\"12\" font=\"0\"><b> </b></text>\n",
161
+ "<text top=\"165\" left=\"132\" width=\"334\" height=\"12\" font=\"0\"><b>MEDIDAS PROVISIONALES SOLICITADAS POR LA COMISIÓN </b></text>\n",
162
+ "<text top=\"177\" left=\"177\" width=\"245\" height=\"12\" font=\"0\"><b>INTERAMERICANA DE DERECHOS HUMANOS </b></text>\n",
163
+ "<text top=\"188\" left=\"225\" width=\"149\" height=\"12\" font=\"0\"><b>RESPECTO DE GUATEMALA </b></text>\n",
164
+ "\n",
165
+ "...\n",
166
+ "```"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "id": "4be01120-c4ce-4e09-bc10-64b1742c9b0b",
172
+ "metadata": {},
173
+ "source": [
174
+ "When we convert PDFs to XMLs with Poppler, it creates `tokens`. These tokens are generally lines of text, but they can vary according to Poppler's heuristics and what has been extracted. \n",
175
+ "A token can be a single character, an empty string, or an entire line. Every `<text>` item you see above is a `token`."
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "markdown",
180
+ "id": "00517165-bc84-4a6f-9a8b-91084cc603ab",
181
+ "metadata": {},
182
+ "source": [
183
+ "The PdfFeatures module provides basic capabilities for working with PDF files. Here are some features of this module. \n",
184
+ "You don't have to memorize them, but they can be useful for future reference:\n",
185
+ "\n",
186
+ "- Every PdfFeatures instance has a `pages` attribute. This attribute includes a list of `PdfPage` elements to work with each of the pages.\n",
187
+ "- Every PdfPage element has attributes like `page_number`, `page_width`, `page_height` and `tokens`.\n",
188
+ "- The `tokens` attribute includes a list of `PdfToken` elements to work with each of the tokens within that page.\n",
189
+ "- Every PdfToken element has attributes like `content`, `bounding_box`, `token_type`, `page_number`.\n",
190
+ "- The `content` attribute is, as the name implies, the string content of the given token.\n",
191
+ "- The `bounding_box` attribute stores the position of the given token on the page.\n",
192
+ "- `bounding_box` is a `Rectangle` element. For example, if you want to get the left coordinate of the token, you can do so by typing `token.bounding_box.left`. It will return an integer value.\n",
193
+ "- The `token_type` attribute stores the type of the token. It's a `TokenType` element; you'll see more details about it in the next sections.\n",
194
+ "- Like PdfPage items, tokens also have a `page_number` attribute to indicate which page they are on. This is useful in some scenarios."
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "markdown",
199
+ "id": "63a71904-0ad3-4fca-830a-402d9334614a",
200
+ "metadata": {},
201
+ "source": [
202
+ "If you want to loop through the tokens of a file and check their contents you can use something like this:"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 5,
208
+ "id": "444d3778-c3f5-48fd-aa20-cfe1bf851aad",
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "\u001B[96mRESOLUCIÓN DE LA\u001B[0m \u001B[93m[Page: 1 || Coordinates: [244, 106, 355, 118]]\u001B[0m\n",
216
+ "\u001B[96mCORTE INTERAMERICANA DE DERECHOS HUMANOS\u001B[0m \u001B[93m[Page: 1 || Coordinates: [157, 118, 441, 130]]\u001B[0m\n",
217
+ "\u001B[96mDEL 29 DE JULIO DE 1991\u001B[0m \u001B[93m[Page: 1 || Coordinates: [227, 129, 372, 141]]\u001B[0m\n",
218
+ "\u001B[96mMEDIDAS PROVISIONALES SOLICITADAS POR LA COMISIÓN\u001B[0m \u001B[93m[Page: 1 || Coordinates: [132, 165, 466, 177]]\u001B[0m\n",
219
+ "\u001B[96mINTERAMERICANA DE DERECHOS HUMANOS\u001B[0m \u001B[93m[Page: 1 || Coordinates: [177, 177, 422, 189]]\u001B[0m\n",
220
+ "\u001B[96mRESPECTO DE GUATEMALA\u001B[0m \u001B[93m[Page: 1 || Coordinates: [225, 188, 374, 200]]\u001B[0m\n",
221
+ "\u001B[96mCASO CHUNIMA\u001B[0m \u001B[93m[Page: 1 || Coordinates: [254, 224, 344, 236]]\u001B[0m\n",
222
+ "\u001B[96mLA CORTE INTERAMERICANA DE DERECHOS HUMANOS,\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 259, 393, 271]]\u001B[0m\n",
223
+ "\u001B[96mVISTOS:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 295, 137, 307]]\u001B[0m\n",
224
+ "\u001B[96m1.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 318, 101, 330]]\u001B[0m\n",
225
+ "\u001B[96mLa resolución del Presidente de la Corte Interamericana de Derechos Humanos\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 318, 511, 330]]\u001B[0m\n",
226
+ "\u001B[96mde 15 de julio de 1991, sobre medidas provisionales solicitadas por la Comisión\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 330, 514, 342]]\u001B[0m\n",
227
+ "\u001B[96mInteramericana de Derechos Humanos respecto de Guatemala;\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 342, 401, 354]]\u001B[0m\n",
228
+ "\u001B[96m2.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 366, 102, 378]]\u001B[0m\n",
229
+ "\u001B[96mLa convocatoria a una audiencia pública para el día 29 de julio de 1991 a las\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 366, 512, 378]]\u001B[0m\n",
230
+ "\u001B[96m3:00 p.m., contenida en la resolución citada;\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 378, 312, 390]]\u001B[0m\n",
231
+ "\u001B[96m3.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 401, 104, 413]]\u001B[0m\n",
232
+ "\u001B[96mLos escritos de fechas 24 y 26 de este mes de julio presentados por el\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 401, 514, 413]]\u001B[0m\n",
233
+ "\u001B[96mGobierno de Guatemala en los cuales informa que, en atención a la resolución del\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 413, 513, 425]]\u001B[0m\n",
234
+ "\u001B[96mPresidente, ha tomado medidas dirigidas a la protección de las personas\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 425, 518, 437]]\u001B[0m\n",
235
+ "\u001B[96mmencionadas en esa resolución y solicita un aplazamiento de por lo menos 30 días de\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 437, 512, 449]]\u001B[0m\n",
236
+ "\u001B[96mla audiencia convocada por el Presidente para hoy, a fin de contar con un plazo que\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 448, 512, 460]]\u001B[0m\n",
237
+ "\u001B[96mle permita hacer una presentación adecuada ante la Corte.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 460, 380, 472]]\u001B[0m\n",
238
+ "\u001B[96mCONSIDERANDO:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 484, 189, 496]]\u001B[0m\n",
239
+ "\u001B[96m1.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 508, 101, 520]]\u001B[0m\n",
240
+ "\u001B[96mQue, en virtud del artículo 23.4 de su Reglamento, la Corte Interamericana de\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 508, 511, 520]]\u001B[0m\n",
241
+ "\u001B[96mDerechos Humanos debe pronunciarse sobre la resolución del Presidente del 15 de\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 519, 513, 531]]\u001B[0m\n",
242
+ "\u001B[96mjulio de 1991;\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 531, 160, 543]]\u001B[0m\n",
243
+ "\u001B[96m2.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 555, 104, 567]]\u001B[0m\n",
244
+ "\u001B[96mQue, habida cuenta de que la Corte se encuentra reunida, debe también\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 555, 514, 567]]\u001B[0m\n",
245
+ "\u001B[96mdecidir sobre la petición de aplazamiento de la audiencia sobre medidas provisionales\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 567, 512, 579]]\u001B[0m\n",
246
+ "\u001B[96mformuladas por el Gobierno de Guatemala.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 578, 300, 590]]\u001B[0m\n",
247
+ "\u001B[96mPOR TANTO:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 602, 159, 614]]\u001B[0m\n",
248
+ "\u001B[96mLA CORTE INTERAMERICANA DE DERECHOS HUMANOS,\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 626, 393, 638]]\u001B[0m\n",
249
+ "\u001B[96mRESUELVE:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 649, 151, 661]]\u001B[0m\n",
250
+ "\u001B[96m1.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 673, 103, 685]]\u001B[0m\n",
251
+ "\u001B[96mConvocar a una audiencia pública para el 30 de julio de 1991 a las 15:00\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 673, 513, 685]]\u001B[0m\n",
252
+ "\u001B[96mhoras con el objeto de conocer los puntos de vista del Gobierno de Guatemala y de la\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 685, 512, 697]]\u001B[0m\n",
253
+ "\u001B[96mComisión sobre la solicitud de prórroga formulada por el primero.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 697, 412, 709]]\u001B[0m\n",
254
+ "\u001B[96m2\u001B[0m \u001B[93m[Page: 2 || Coordinates: [294, 71, 300, 83]]\u001B[0m\n",
255
+ "\u001B[96m2.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 106, 101, 118]]\u001B[0m\n",
256
+ "\u001B[96mConocer también, en dicha audiencia pública, de las medidas que, en atención\u001B[0m \u001B[93m[Page: 2 || Coordinates: [122, 106, 511, 118]]\u001B[0m\n",
257
+ "\u001B[96ma la resolución del Presidente del 15 de julio del presente año, ha tomado el\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 118, 515, 130]]\u001B[0m\n",
258
+ "\u001B[96mGobierno de Guatemala.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 129, 211, 141]]\u001B[0m\n",
259
+ "\u001B[96m3.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 153, 103, 165]]\u001B[0m\n",
260
+ "\u001B[96mReservarse el derecho de convocar a una audiencia pública para resolver la\u001B[0m \u001B[93m[Page: 2 || Coordinates: [122, 153, 513, 165]]\u001B[0m\n",
261
+ "\u001B[96mpetición de la Comisión sobre medidas provisionales respecto de Guatemala.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 165, 467, 177]]\u001B[0m\n",
262
+ "\u001B[96mHéctor Fix-Zamudio\u001B[0m \u001B[93m[Page: 2 || Coordinates: [249, 200, 349, 212]]\u001B[0m\n",
263
+ "\u001B[96mPresidente\u001B[0m \u001B[93m[Page: 2 || Coordinates: [272, 212, 327, 224]]\u001B[0m\n",
264
+ "\u001B[96mOrlando\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 248, 161, 260]]\u001B[0m\n",
265
+ "\u001B[96mTovar\u001B[0m \u001B[93m[Page: 2 || Coordinates: [129, 248, 191, 260]]\u001B[0m\n",
266
+ "\u001B[96mTamayo\u001B[0m \u001B[93m[Page: 2 || Coordinates: [161, 248, 234, 260]]\u001B[0m\n",
267
+ "\u001B[96mThomas\u001B[0m \u001B[93m[Page: 2 || Coordinates: [225, 248, 436, 260]]\u001B[0m\n",
268
+ "\u001B[96mBuergenthal\u001B[0m \u001B[93m[Page: 2 || Coordinates: [405, 248, 499, 260]]\u001B[0m\n",
269
+ "\u001B[96mRafael Nieto Navia\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 283, 195, 295]]\u001B[0m\n",
270
+ "\u001B[96mPolicarpo Callejas Bonilla\u001B[0m \u001B[93m[Page: 2 || Coordinates: [329, 283, 481, 295]]\u001B[0m\n",
271
+ "\u001B[96mSonia\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 318, 150, 330]]\u001B[0m\n",
272
+ "\u001B[96mPicado\u001B[0m \u001B[93m[Page: 2 || Coordinates: [118, 318, 184, 330]]\u001B[0m\n",
273
+ "\u001B[96mSotela\u001B[0m \u001B[93m[Page: 2 || Coordinates: [153, 318, 218, 330]]\u001B[0m\n",
274
+ "\u001B[96mJulio\u001B[0m \u001B[93m[Page: 2 || Coordinates: [191, 318, 419, 330]]\u001B[0m\n",
275
+ "\u001B[96mA.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [388, 318, 433, 330]]\u001B[0m\n",
276
+ "\u001B[96mBarberis\u001B[0m \u001B[93m[Page: 2 || Coordinates: [402, 318, 477, 330]]\u001B[0m\n",
277
+ "\u001B[96mManuel E. Ventura Robles\u001B[0m \u001B[93m[Page: 2 || Coordinates: [235, 354, 364, 366]]\u001B[0m\n",
278
+ "\u001B[96mSecretario\u001B[0m \u001B[93m[Page: 2 || Coordinates: [273, 366, 326, 378]]\u001B[0m\n"
279
+ ]
280
+ }
281
+ ],
282
+ "source": [
283
+ "for page in pdf_features.pages:\n",
284
+ " for token in page.tokens:\n",
285
+ " coordinates = [token.bounding_box.left, token.bounding_box.top, token.bounding_box.right, token.bounding_box.bottom]\n",
286
+ " print(f\"\\033[96m{token.content}\\033[0m \\033[93m[Page: {page.page_number} || Coordinates: {coordinates}]\\033[0m\")"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "id": "4576ff4d-92fc-4e19-a947-ebfb3fd01060",
292
+ "metadata": {},
293
+ "source": [
294
+ "## Fine-Tuning Models"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "id": "01826a89-25c9-4385-a1e6-b65c0edbd0c6",
300
+ "metadata": {},
301
+ "source": [
302
+ "Now that we have an overview of the `PdfFeatures` module, we can start the fine-tuning process."
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "markdown",
307
+ "id": "586eba43-9138-4eff-a3fa-24553de04e82",
308
+ "metadata": {},
309
+ "source": [
310
+ "In the `pdf-document-layout-analysis` service, there are two LightGBM (i.e. fast) models.\n",
311
+ "\n",
312
+ "- The first model is used to determine the types of tokens. We call it `token_type_model`.\n",
313
+ "- The second model is used to identify the segments to which the tokens belong. We call this model `paragraph_extraction_model`.\n",
314
+ "\n",
315
+ "The second model uses the predictions from the first model's output (predicted token types) as part of its features. So, let's start by fine-tuning the token type model."
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "markdown",
320
+ "id": "c326ccb1-a36b-40f9-b7e2-e83ba3c0e12b",
321
+ "metadata": {},
322
+ "source": [
323
+ "### Fine-Tuning Token Type Model"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "id": "7b3638eb-c512-4bd5-97f4-4df3ae984978",
329
+ "metadata": {},
330
+ "source": [
331
+ "#### Loading Data"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "markdown",
336
+ "id": "ab35b27c-8464-470c-9ef1-a9aef8945f6a",
337
+ "metadata": {},
338
+ "source": [
339
+ "To properly train a token type model, you should have a list of PdfFeatures items where the `token_type` attribute of their tokens is set correctly, as this attribute will be used as the label.\n",
340
+ "\n",
341
+ "To see which labels are going to be used in the model, you can check `pdf_token_type_labels/TokenType.py`. By default, we use the labels of the [DocLayNet](https://github.com/DS4SD/DocLayNet) dataset."
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 6,
347
+ "id": "2ab3093c-6e67-4505-bac3-b7db73ef5372",
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": [
351
+ "def get_pdf_features_labels() -> list[PdfFeatures]:\n",
352
+ " # Assuming that you are loading your own labels in this part.\n",
353
+ " # I'm just going to put a list with a single file for demonstration.\n",
354
+ " pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")\n",
355
+ " labeled_pdf_features_list: list[PdfFeatures] = [pdf_features]\n",
356
+ " return labeled_pdf_features_list\n",
357
+ "\n",
358
+ "def train_token_type_model():\n",
359
+ " model_configuration = ModelConfiguration()\n",
360
+ " labeled_pdf_features_list: list[PdfFeatures] = get_pdf_features_labels()\n",
361
+ " trainer = TokenTypeTrainer(labeled_pdf_features_list, model_configuration)\n",
362
+ " train_labels = [token.token_type.get_index() for token in trainer.loop_tokens()]\n",
363
+ " trainer.train(\"models/token_type_example_model.model\", train_labels) \n",
364
+ "\n",
365
+ "train_token_type_model()"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "32db8aee-9d2c-45bf-b7af-ac6249081f32",
371
+ "metadata": {},
372
+ "source": "Don't forget to check what's inside the `model_configuration`. You might need to tune the hyperparameters."
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "id": "fda0c166-ac25-4084-974a-c73f1cb06f18",
377
+ "metadata": {},
378
+ "source": "If you want to use our trained models as a base and refit them with your own data, you can use this function:"
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "id": "5acf2beb-f7a2-4e12-8f11-4bffff7efa74",
383
+ "metadata": {},
384
+ "source": [
385
+ "def refit_token_type_model():\n",
386
+ " model_configuration = ModelConfiguration()\n",
387
+ " model_configuration.resume_training = True\n",
388
+ " labeled_pdf_features_list: list[PdfFeatures] = get_pdf_features_labels()\n",
389
+ " trainer = TokenTypeTrainer(labeled_pdf_features_list, model_configuration)\n",
390
+ " train_labels = [token.token_type.get_index() for token in trainer.loop_tokens()]\n",
391
+ " trainer.train(\"models/token_type_lightgbm.model\", train_labels)\n"
392
+ ],
393
+ "outputs": [],
394
+ "execution_count": null
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "id": "7c50cbae-9841-4289-9097-7357a0c724a7",
399
+ "metadata": {},
400
+ "source": "Running this function will refit the same model with your data. Depending on your situation, it may or may not help you."
401
+ },
402
+ {
403
+ "cell_type": "markdown",
404
+ "id": "19abde59-7ba5-4e65-8ce7-6bb7fb2202d5",
405
+ "metadata": {},
406
+ "source": [
407
+ "If it does not help, you can try other fine-tuning strategies in LightGBM. \n",
408
+ "\n",
409
+ "In that case, all you need to do is change this part in `pdf_tokens_type_trainer/PdfTrainer.py` (lines 47-49):\n",
410
+ "\n",
411
+ "```\n",
412
+ " if self.model_configuration.resume_training and exists(model_path):\n",
413
+ " model = lgb.Booster(model_file=model_path)\n",
414
+ " gbm = model.refit(x_train, labels)\n",
415
+ "```"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "markdown",
420
+ "id": "5379e82a-9fa7-4fea-9d6b-a11e672707bc",
421
+ "metadata": {},
422
+ "source": "To make predictions with the trained model, you can use this function:"
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "id": "f5b7f4fb-7052-4e8c-856a-6b1d83e5ece4",
427
+ "metadata": {},
428
+ "source": [
429
+ "def get_predictions():\n",
430
+ " model_configuration = ModelConfiguration()\n",
431
+ " pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")\n",
432
+ " trainer = TokenTypeTrainer([pdf_features], model_configuration)\n",
433
+ " trainer.set_token_types()\n",
434
+ " for token in pdf_features.pages[0].tokens[:20]:\n",
435
+ " print(f\"\\033[96m{token.content}\\033[0m \\033[93m[{token.token_type}]\\033[0m\")\n",
436
+ "\n",
437
+ "get_predictions() "
438
+ ],
439
+ "outputs": [],
440
+ "execution_count": null
441
+ },
442
+ {
443
+ "cell_type": "markdown",
444
+ "id": "e6808202-892d-43e0-9e7a-73ebc347901f",
445
+ "metadata": {},
446
+ "source": "### Fine-Tuning Paragraph Extraction Model"
447
+ },
448
+ {
449
+ "cell_type": "markdown",
450
+ "id": "0b31a859-7867-4bd0-be13-7ae4ff4c8a61",
451
+ "metadata": {},
452
+ "source": "#### Loading Data"
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "id": "2778fd0b-5351-4c83-a15a-ecf8aac91397",
457
+ "metadata": {},
458
+ "source": "The second model in our pipeline is the paragraph extraction model. After finding the type of each token, we are going to \"group\" the tokens, that is, to find the segment each token belongs to."
459
+ },
460
+ {
461
+ "cell_type": "markdown",
462
+ "id": "8112645a-816d-4579-b6e9-14b505703fc9",
463
+ "metadata": {},
464
+ "source": "We are going to explain the process, but for this part we highly recommend placing your labeled data in the following file structure and using the existing methods. Otherwise, it can be harder to use our modules:"
465
+ },
466
+ {
467
+ "cell_type": "markdown",
468
+ "id": "b96c7988-cd1e-492a-9990-84db9f7111d2",
469
+ "metadata": {},
470
+ "source": [
471
+ "```\n",
472
+ ".\n",
473
+ "└── pdf-labeled-data\n",
474
+ " ├── labeled_data\n",
475
+ " │ ├── token_type\n",
476
+ " │ │ ├── train_data\n",
477
+ " │ │ │ ├── example_document1\n",
478
+ " │ │ │ │ └── labels.json\n",
479
+ " │ │ │ ├── example_document2\n",
480
+ " │ │ │ │ └── labels.json\n",
481
+ " │ │ │ └── example_document3\n",
482
+ " │ │ │ └── labels.json\n",
483
+ " │ │ └── test_data\n",
484
+ " │ │ └── example_document4\n",
485
+ " │ │ └── labels.json\n",
486
+ " │ └── paragraph_extraction\n",
487
+ " │ ├── train_data\n",
488
+ " │ │ ├── example_document1\n",
489
+ " │ │ │ └── labels.json\n",
490
+ " │ │ ├── example_document2\n",
491
+ " │ │ │ └── labels.json\n",
492
+ " │ │ └── example_document3\n",
493
+ " │ │ └── labels.json\n",
494
+ " │ └── test_data\n",
495
+ " │ └── example_document4\n",
496
+ " │ └── labels.json\n",
497
+ " └── pdfs\n",
498
+ " ├── example_document1\n",
499
+ " │ ├── document.pdf\n",
500
+ " │ └── etree.xml\n",
501
+ " ├── example_document2\n",
502
+ " │ ├── document.pdf\n",
503
+ " │ └── etree.xml\n",
504
+ " ├── example_document3\n",
505
+ " │ ├── document.pdf\n",
506
+ " │ └── etree.xml\n",
507
+ " └── example_document4\n",
508
+ " ├── document.pdf\n",
509
+ " └── etree.xml\n",
510
+ "```"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "markdown",
515
+ "id": "6c40e426-af77-47fc-a82c-77b5ca4fddeb",
516
+ "metadata": {},
517
+ "source": [
518
+ "Some details about this structure:\n",
519
+ "\n",
520
+ "- Every detail in the token type labels file structure applies to this structure too.\n",
521
+ "- The `paragraph_extraction` directory is where your paragraph extraction datasets are located; it must keep exactly this name.\n",
522
+ "- `token_type` labels are also shown in the structure because token types are used as a feature in the paragraph extraction model. If you do not have them, the pipeline will still train the model, but the `token_type` feature for every token will default to `TokenType.TEXT`.\n",
523
+ "- If you do not have `token_type` labels, another option is to predict the token types with the token type model after loading the data (shown below).\n"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "markdown",
528
+ "id": "1e234e69-7e50-4ffe-a31b-2dc8248a676f",
529
+ "metadata": {},
530
+ "source": "The labels.json files should have this structure:"
531
+ },
532
+ {
533
+ "cell_type": "markdown",
534
+ "id": "472072a6-a02c-4b75-bbc0-f13bb7e357d2",
535
+ "metadata": {},
536
+ "source": [
537
+ "```\n",
538
+ "{\n",
539
+ " \"pages\": [\n",
540
+ " {\n",
541
+ " \"number\": 1,\n",
542
+ " \"labels\": [\n",
543
+ " {\n",
544
+ " \"top\": 86,\n",
545
+ " \"left\": 162,\n",
546
+ " \"width\": 292,\n",
547
+ " \"height\": 24,\n",
548
+ " \"label_type\": 0\n",
549
+ " },\n",
550
+ " {\n",
551
+ " \"top\": 122,\n",
552
+ " \"left\": 221,\n",
553
+ " \"width\": 174,\n",
554
+ " \"height\": 12,\n",
555
+ " \"label_type\": 0\n",
556
+ " }\n",
557
+ " ]\n",
558
+ " },\n",
559
+ " {\n",
560
+ " \"number\": 2,\n",
561
+ " \"labels\": [\n",
562
+ " {\n",
563
+ " \"top\": 36,\n",
564
+ " \"left\": 296,\n",
565
+ " \"width\": 22,\n",
566
+ " \"height\": 13,\n",
567
+ " \"label_type\": 0\n",
568
+ " },\n",
569
+ " {\n",
570
+ " \"top\": 72,\n",
571
+ " \"left\": 71,\n",
572
+ " \"width\": 473,\n",
573
+ " \"height\": 49,\n",
574
+ " \"label_type\": 0\n",
575
+ " }\n",
576
+ " ]\n",
577
+ " }\n",
578
+ " ]\n",
579
+ "}\n",
580
+ "```"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "markdown",
585
+ "id": "bb6e716b-b742-4186-9e1a-ac5ecea708ac",
586
+ "metadata": {},
587
+ "source": [
588
+ "Here you see a list of labels for each page. Each label includes the coordinates `top`, `left`, `width`, `height` for each segment/paragraph. So, this time the coordinates belong to the segments, not to the tokens.\n",
589
+ "\n",
590
+ "The \"label_type\" should always be 0, since there is only one type, \"paragraph\" (don't overthink this part; just put 0 and move on).\n"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "markdown",
595
+ "id": "a2c8a9b3-6180-41f2-bb82-bea892a61f5e",
596
+ "metadata": {},
597
+ "source": "Using this information, you can load your data like this:"
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "id": "cb6ae549-4f52-45b0-853a-6414ca8b4af3",
602
+ "metadata": {},
603
+ "source": [
604
+ "from os.path import join\n",
605
+ "from paragraph_extraction_trainer.PdfParagraphTokens import PdfParagraphTokens\n",
606
+ "\n",
607
+ "\n",
608
+ "def load_paragraph_extraction_labels():\n",
609
+ "\t\n",
610
+ "\tpdf_labeled_data_root_path = \"path/to/pdf/labeled/data\"\n",
611
+ "\tdatasets_path = join(pdf_labeled_data_root_path, \"paragraph_extraction\")\n",
612
+ "\tlabeled_data: list[PdfParagraphTokens] = []\n",
613
+ "\t\n",
614
+ "\tfor dataset in listdir(join(datasets_path)):\n",
615
+ "\t\tif \"train\" not in dataset:\n",
616
+ "\t\t\tcontinue\n",
617
+ "\t\tpdf_paragraph_tokens: PdfParagraphTokens = PdfParagraphTokens.from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name)\n",
618
+ "\t\tlabeled_data.append(pdf_paragraph_tokens)\n",
619
+ "\t\n",
620
+ "\treturn labeled_data\n",
621
+ "\n",
622
+ "\n",
623
+ "from adapters.ml.pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer\n",
624
+ "\n",
625
+ "def load_paragraph_extraction_labels():\n",
626
+ "\n",
627
+ " pdf_labeled_data_root_path = \"path/to/pdf/labeled/data\"\n",
628
+ " datasets_path = join(pdf_labeled_data_root_path, \"paragraph_extraction\")\n",
629
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = []\n",
630
+ " \n",
631
+ " for dataset in listdir(join(datasets_path)):\n",
632
+ " if \"train\" not in dataset:\n",
633
+ " continue\n",
634
+ " pdf_paragraph_tokens: PdfParagraphTokens = PdfParagraphTokens.from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name)\n",
635
+ " labeled_pdf_paragraph_tokens_list.append(pdf_paragraph_tokens)\n",
636
+ " \n",
637
+ " \n",
638
+ " token_type_model_configuration = ModelConfiguration()\n",
639
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
640
+ " trainer = TokenTypeTrainer(labeled_pdf_features_list, token_type_model_configuration)\n",
641
+ " \n",
642
+ " \n",
643
+ " return labeled_pdf_paragraph_tokens_list"
644
+ ],
645
+ "outputs": [],
646
+ "execution_count": null
647
+ },
648
+ {
649
+ "cell_type": "markdown",
650
+ "id": "cf3f6a6c-cba7-43c4-9f72-85cbe447cb6e",
651
+ "metadata": {},
652
+ "source": "#### Fine-Tuning the Model"
653
+ },
654
+ {
655
+ "cell_type": "markdown",
656
+ "id": "29dbaba4-d3d6-4985-be44-df872fe9b5d4",
657
+ "metadata": {},
658
+ "source": "Again, to be able to use our trained paragraph extraction model, you should download it from our Hugging Face repo. You can just run `download_models.py` and both models will be downloaded."
659
+ },
660
+ {
661
+ "cell_type": "markdown",
662
+ "id": "8a82f6f6-cec9-48bc-9c64-b09aa65d2754",
663
+ "metadata": {},
664
+ "source": [
665
+ "If you want to download it manually, you can use this link: https://huggingface.co/HURIDOCS/pdf-document-layout-analysis/tree/main\n",
666
+ "\n",
667
+ "After downloading it, place it into `models` directory. The path should be as follows: \n",
668
+ "`~/pdf-document-layout-analysis/models/paragraph_extraction_lightgbm.model`"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "markdown",
673
+ "id": "b95cd2cd-0d41-4518-8576-b1a0d2adc21b",
674
+ "metadata": {},
675
+ "source": "To train the paragraph extraction model from scratch:"
676
+ },
677
+ {
678
+ "cell_type": "code",
679
+ "id": "67948603-80e6-4b42-9ba1-78868fd9f946",
680
+ "metadata": {},
681
+ "source": [
682
+ "from paragraph_extraction_trainer.model_configuration import MODEL_CONFIGURATION\n",
683
+ "\n",
684
+ "\n",
685
+ "def loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list: list[PdfParagraphTokens]):\n",
686
+ " for pdf_paragraph_tokens in pdf_paragraph_tokens_list:\n",
687
+ " for page in pdf_paragraph_tokens.pdf_features.pages:\n",
688
+ " if not page.tokens:\n",
689
+ " continue\n",
690
+ " for token, next_token in zip(page.tokens, page.tokens[1:]):\n",
691
+ " yield pdf_paragraph_tokens, token, next_token\n",
692
+ " yield pdf_paragraph_tokens, page.tokens[-1], page.tokens[-1]\n",
693
+ "\n",
694
+ "\n",
695
+ "def train_paragraph_extraction_model():\n",
696
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
697
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
698
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
699
+ " \n",
700
+ " train_labels = []\n",
701
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
702
+ " train_labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))\n",
703
+ "\n",
704
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
705
+ ],
706
+ "outputs": [],
707
+ "execution_count": null
708
+ },
709
+ {
710
+ "cell_type": "markdown",
711
+ "id": "2e7cd129-874e-415d-9855-401d8c5d0136",
712
+ "metadata": {},
713
+ "source": "And to refit the model with your own data, all you need to do is set the `resume_training` configuration to `True`:"
714
+ },
715
+ {
716
+ "cell_type": "code",
717
+ "id": "37b6b980-deaf-4ba4-baf0-7bf137af63a7",
718
+ "metadata": {},
719
+ "source": [
720
+ "def refit_paragraph_extraction_model():\n",
721
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
722
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
723
+ " MODEL_CONFIGURATION.resume_training = True\n",
724
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
725
+ " \n",
726
+ " train_labels = []\n",
727
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
728
+ " train_labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))\n",
729
+ "\n",
730
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
731
+ ],
732
+ "outputs": [],
733
+ "execution_count": null
734
+ },
735
+ {
736
+ "cell_type": "markdown",
737
+ "id": "1389cf49-c163-4f90-ab0c-9606756b8ef9",
738
+ "metadata": {},
739
+ "source": "<font color='red'>[IMPORTANT]</font> If you want to use your own trained models in pdf-document-layout-analysis service, make sure their names are `token_type_lightgbm.model` and `paragraph_extraction_lightgbm.model` and are placed in `models` directory."
740
+ },
741
+ {
742
+ "cell_type": "markdown",
743
+ "id": "b1d4cf8c-65d2-4496-adcf-ab73acc5000f",
744
+ "metadata": {},
745
+ "source": "After training finishes, you can get the model's predictions as shown below:"
746
+ },
747
+ {
748
+ "cell_type": "code",
749
+ "id": "69e747aa-9b19-4e8d-acbb-f8d221dfe006",
750
+ "metadata": {},
751
+ "source": [
752
+ "from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration\n",
753
+ "from fast_trainer.model_configuration import MODEL_CONFIGURATION as PARAGRAPH_EXTRACTION_CONFIGURATION\n",
754
+ "from domain.PdfSegment import PdfSegment\n",
755
+ "from adapters.ml.fast_trainer.ParagraphExtractorTrainer import ParagraphExtractorTrainer\n",
756
+ "\n",
757
+ "def get_predictions():\n",
758
+ " pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")\n",
759
+ " # First, use token type model to find and set the types.\n",
760
+ " token_type_trainer = TokenTypeTrainer([pdf_features], ModelConfiguration())\n",
761
+ " token_type_trainer.set_token_types(\"models/token_type_lightgbm.model\")\n",
762
+ " trainer = ParagraphExtractorTrainer(pdfs_features=[pdf_features], model_configuration=PARAGRAPH_EXTRACTION_CONFIGURATION)\n",
763
+ " segments: list[PdfSegment] = trainer.get_pdf_segments(\"models/paragraph_extraction_lightgbm.model\")\n",
764
+ " model_configuration = ModelConfiguration()\n",
765
+ " for segment in segments[:20]:\n",
766
+ " print(f\"\\033[96m{segment.text_content}\\033[0m \\033[93m[{segment.segment_type}]\\033[0m \\033[91m{segment.bounding_box.to_dict()}\\033[0m\")"
767
+ ],
768
+ "outputs": [],
769
+ "execution_count": null
770
+ },
771
+ {
772
+ "cell_type": "markdown",
773
+ "id": "e3af70a1-404e-4bac-a366-f7962636b1eb",
774
+ "metadata": {},
775
+ "source": "The output of the `paragraph_extraction_model` is a list of `PdfSegment` items. Every item includes information such as `page_number`, `text_content`, `segment_type`, `bounding_box`, and `pdf_name` for each segment."
776
+ },
777
+ {
778
+ "cell_type": "code",
779
+ "id": "4dc0c106-7b22-42e3-969f-d52ecddae3ae",
780
+ "metadata": {},
781
+ "source": "",
782
+ "outputs": [],
783
+ "execution_count": null
784
+ },
785
+ {
786
+ "cell_type": "markdown",
787
+ "id": "3d5b2376-d983-4c49-8130-b94368782828",
788
+ "metadata": {},
789
+ "source": [
790
+ "```\n",
791
+ "{\n",
792
+ " \"pages\": [\n",
793
+ " {\n",
794
+ " \"number\": 1,\n",
795
+ " \"labels\": [\n",
796
+ " {\n",
797
+ " \"top\": 86,\n",
798
+ " \"left\": 162,\n",
799
+ " \"width\": 292,\n",
800
+ " \"height\": 24,\n",
801
+ " \"label_type\": 0\n",
802
+ " },\n",
803
+ " {\n",
804
+ " \"top\": 122,\n",
805
+ " \"left\": 221,\n",
806
+ " \"width\": 174,\n",
807
+ " \"height\": 12,\n",
808
+ " \"label_type\": 0\n",
809
+ " }\n",
810
+ " ]\n",
811
+ " },\n",
812
+ " {\n",
813
+ " \"number\": 2,\n",
814
+ " \"labels\": [\n",
815
+ " {\n",
816
+ " \"top\": 36,\n",
817
+ " \"left\": 296,\n",
818
+ " \"width\": 22,\n",
819
+ " \"height\": 13,\n",
820
+ " \"label_type\": 0\n",
821
+ " },\n",
822
+ " {\n",
823
+ " \"top\": 72,\n",
824
+ " \"left\": 71,\n",
825
+ " \"width\": 473,\n",
826
+ " \"height\": 49,\n",
827
+ " \"label_type\": 0\n",
828
+ " }\n",
829
+ " ]\n",
830
+ " }\n",
831
+ " ]\n",
832
+ "}\n",
833
+ "```"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "markdown",
838
+ "id": "1972189b-c70b-436d-9830-56adc354b777",
839
+ "metadata": {},
840
+ "source": [
841
+ "Using this information, you can load your data like this:"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "markdown",
846
+ "id": "d6c07ba4-334e-4ff3-8e2f-b2f684f053c9",
847
+ "metadata": {},
848
+ "source": [
849
+ "In case you do not have `token_type` labels and want to find the types with the `token_type_model`, you can use this:"
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "markdown",
854
+ "id": "41b6bb64-92a2-4b75-95f9-a934c104b7c0",
855
+ "metadata": {},
856
+ "source": [
857
+ "#### Fine-Tuning the Model"
858
+ ]
859
+ },
860
+ {
861
+ "cell_type": "markdown",
862
+ "id": "bd38ced0-2925-4fe5-98ec-b633a19b5ce3",
863
+ "metadata": {},
864
+ "source": [
865
+ "If you want to download it manually, you can use this link: https://huggingface.co/HURIDOCS/pdf-document-layout-analysis/tree/main\n",
866
+ "\n",
867
+ "After downloading it, place it into `models` directory. The path should be as follows: \n",
868
+ "`~/pdf-document-layout-analysis/models/paragraph_extraction_lightgbm.model`"
869
+ ]
870
+ },
871
+ {
872
+ "cell_type": "code",
873
+ "execution_count": 12,
874
+ "id": "60b22be7-35d0-4c34-891e-c67d25942c72",
875
+ "metadata": {},
876
+ "outputs": [],
877
+ "source": [
878
+ "from paragraph_extraction_trainer.model_configuration import MODEL_CONFIGURATION\n",
879
+ "\n",
880
+ "\n",
881
+ "def loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list: list[PdfParagraphTokens]):\n",
882
+ " for pdf_paragraph_tokens in pdf_paragraph_tokens_list:\n",
883
+ " for page in pdf_paragraph_tokens.pdf_features.pages:\n",
884
+ " if not page.tokens:\n",
885
+ " continue\n",
886
+ " for token, next_token in zip(page.tokens, page.tokens[1:]):\n",
887
+ " yield pdf_paragraph_tokens, token, next_token\n",
888
+ " yield pdf_paragraph_tokens, page.tokens[-1], page.tokens[-1]\n",
889
+ "\n",
890
+ "\n",
891
+ "def train_paragraph_extraction_model():\n",
892
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
893
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
894
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
895
+ " \n",
896
+ " train_labels = []\n",
897
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
898
+ " train_labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))\n",
899
+ "\n",
900
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
901
+ ]
902
+ },
903
+ {
904
+ "cell_type": "code",
905
+ "execution_count": 13,
906
+ "id": "5a652ca1-b9c7-4731-ba8b-aa98cd0d11a7",
907
+ "metadata": {},
908
+ "outputs": [],
909
+ "source": [
910
+ "def refit_paragraph_extraction_model():\n",
911
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
912
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
913
+ " MODEL_CONFIGURATION.resume_training = True\n",
914
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
915
+ " \n",
916
+ " train_labels = []\n",
917
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens([pdf_paragraph_tokens]):\n",
918
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
919
+ "\n",
920
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "markdown",
925
+ "id": "0ca5d8ef-7455-4723-af4e-d8c49096251f",
926
+ "metadata": {},
927
+ "source": [
928
+ "After training finishes, you can get the model's predictions as shown below:"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "markdown",
933
+ "id": "e5a5ab63-7931-40e0-8f51-43e3f3ef5b32",
934
+ "metadata": {},
935
+ "source": [
936
+ "The output of the `paragraph_extraction_model` is a list of `PdfSegment` items. Every item includes information such as `page_number`, `text_content`, `segment_type`, `bounding_box`, and `pdf_name` for each segment."
937
+ ]
938
+ }
939
+ ],
940
+ "metadata": {
941
+ "kernelspec": {
942
+ "display_name": "Python 3 (ipykernel)",
943
+ "language": "python",
944
+ "name": "python3"
945
+ },
946
+ "language_info": {
947
+ "codemirror_mode": {
948
+ "name": "ipython",
949
+ "version": 3
950
+ },
951
+ "file_extension": ".py",
952
+ "mimetype": "text/x-python",
953
+ "name": "python",
954
+ "nbconvert_exporter": "python",
955
+ "pygments_lexer": "ipython3",
956
+ "version": "3.11.9"
957
+ }
958
+ },
959
+ "nbformat": 4,
960
+ "nbformat_minor": 5
961
+ }
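The notebook shows the labels.json format verbatim but never reads one back. A small sketch under the assumption that a file with exactly that structure exists at the (hypothetical) path below:

```python
import json

# Hypothetical path following the directory layout described in the notebook.
labels_path = "pdf-labeled-data/labeled_data/paragraph_extraction/train_data/example_document1/labels.json"

with open(labels_path) as labels_file:
    labels = json.load(labels_file)

for page in labels["pages"]:
    for label in page["labels"]:
        # Convert top/left/width/height into a (left, top, right, bottom) box,
        # matching the coordinate style printed for PdfToken bounding boxes.
        box = (
            label["left"],
            label["top"],
            label["left"] + label["width"],
            label["top"] + label["height"],
        )
        print(f"page {page['number']}: paragraph box {box}, label_type {label['label_type']}")
```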
images/vgtexample1.png ADDED

Git LFS Details

  • SHA256: 4b68017bb1ff60317bc2575db44db7117a245321e2baa34efd24b115748a38ca
  • Pointer size: 131 Bytes
  • Size of remote file: 240 kB
images/vgtexample2.png ADDED

Git LFS Details

  • SHA256: eb2bbb4a4ae5351cf7829b0ba217b21248fd0b92e510e3578c0130952b7573a1
  • Pointer size: 131 Bytes
  • Size of remote file: 256 kB
images/vgtexample3.png ADDED

Git LFS Details

  • SHA256: fae87bba8266250d03815b183f4c5ef3e839998bb9dcd187b99ea87e99384ff1
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
images/vgtexample4.png ADDED

Git LFS Details

  • SHA256: 1a7c9a4fe0d53c57cca52b56a1de98988b9d2ec0a7be25109d120e20f87fa118
  • Pointer size: 131 Bytes
  • Size of remote file: 213 kB
justfile ADDED
@@ -0,0 +1,95 @@
1
+ HAS_GPU := `command -v nvidia-smi > /dev/null && echo 1 || echo 0`
2
+
3
+ install:
4
+ . .venv/bin/activate; pip install -Ur requirements.txt
5
+
6
+ activate:
7
+ . .venv/bin/activate
8
+
9
+ install_venv:
10
+ python3 -m venv .venv
11
+ . .venv/bin/activate; python -m pip install --upgrade pip
12
+ . .venv/bin/activate; python -m pip install -r dev-requirements.txt
13
+
14
+ formatter:
15
+ . .venv/bin/activate; command black --line-length 125 .
16
+
17
+ check_format:
18
+ . .venv/bin/activate; command black --line-length 125 . --check
19
+
20
+ remove_docker_containers:
21
+ docker compose ps -q | xargs docker rm
22
+
23
+ remove_docker_images:
24
+ docker compose config --images | xargs docker rmi
25
+
26
+ start:
27
+ mkdir -p ./models
28
+ if [ {{HAS_GPU}} -eq 1 ]; then \
29
+ echo "NVIDIA GPU detected, using docker-compose-gpu.yml"; \
30
+ docker compose -f docker-compose-gpu.yml up --build; \
31
+ else \
32
+ echo "No NVIDIA GPU detected, using docker-compose.yml"; \
33
+ docker compose -f docker-compose.yml up --build; \
34
+ fi
35
+
36
+ start_no_gpu:
37
+ mkdir -p ./models
38
+ docker compose up --build
39
+
40
+ stop:
41
+ docker compose stop
42
+
43
+ test:
44
+ . .venv/bin/activate; command cd src; command python -m pytest
45
+
46
+ free_up_space:
47
+ df -h
48
+ sudo rm -rf /usr/share/dotnet
49
+ sudo rm -rf /opt/ghc
50
+ sudo rm -rf "/usr/local/share/boost"
51
+ sudo rm -rf "$AGENT_TOOLSDIRECTORY"
52
+ sudo apt-get remove -y '^llvm-.*' || true
53
+ sudo apt-get remove -y 'php.*' || true
54
+ sudo apt-get remove -y google-cloud-sdk hhvm google-chrome-stable firefox mono-devel || true
55
+ sudo apt-get autoremove -y
56
+ sudo apt-get clean
57
+ sudo rm -rf /usr/share/dotnet
58
+ sudo rm -rf /usr/local/lib/android
59
+ sudo rm -rf /opt/hostedtoolcache/CodeQL
60
+ sudo docker image prune --all --force
61
+ df -h
62
+
63
+ start_detached:
64
+ mkdir -p ./models
65
+ docker compose up --build -d
66
+
67
+ start_detached_gpu:
68
+ mkdir -p ./models
69
+ RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
70
+
71
+ upgrade:
72
+ . .venv/bin/activate; pip-upgrade
73
+
74
+ tag:
75
+ #!/bin/bash
76
+ # Get current date
77
+ CURRENT_DATE=$(date +%Y.%-m.%-d)
78
+ echo "Current date: $CURRENT_DATE"
79
+
80
+ # Get the latest tag that matches today's date pattern
81
+ LATEST_TAG=$(git tag --list "${CURRENT_DATE}.*" --sort=-version:refname | head -n1)
82
+
83
+ if [ -z "$LATEST_TAG" ]; then
84
+ # No tag for today, start with revision 1
85
+ REVISION=1
86
+ else
87
+ # Extract revision number and increment
88
+ REVISION=$(echo $LATEST_TAG | cut -d. -f4)
89
+ REVISION=$((REVISION + 1))
90
+ fi
91
+
92
+ NEW_TAG="${CURRENT_DATE}.${REVISION}"
93
+ echo "Creating new tag: $NEW_TAG"
94
+ git tag $NEW_TAG
95
+ git push --tag
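The `tag` recipe derives a date-plus-revision version (for example `2025.3.18.2`). A Python sketch of the same scheme, with `existing_tags` standing in for the output of `git tag --list` (illustrative only):

```python
from datetime import date

def next_tag(existing_tags: list[str], today: date | None = None) -> str:
    # Mirror of the justfile logic: <year>.<month>.<day>.<revision>, revision restarts each day.
    today = today or date.today()
    prefix = f"{today.year}.{today.month}.{today.day}"
    revisions = [
        int(tag.rsplit(".", 1)[1])
        for tag in existing_tags
        if tag.startswith(prefix + ".") and tag.rsplit(".", 1)[1].isdigit()
    ]
    return f"{prefix}.{max(revisions) + 1 if revisions else 1}"

print(next_tag(["2025.3.18.1", "2025.3.18.2"], date(2025, 3, 18)))  # 2025.3.18.3
print(next_tag([], date(2025, 3, 18)))                              # 2025.3.18.1
```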
master_key.py CHANGED
@@ -305,6 +305,7 @@ TABLE_SCHEMAS = {
305
  "orientation": "row1",
306
  "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
307
  "priority": 90,
 
308
  "context_exclusions": ["manager", "operator declaration"]
309
  },
310
  "Audit Declaration dates": {
@@ -368,4 +369,4 @@ PARAGRAPH_PATTERNS = {
368
  "declaration_text": r"I hereby acknowledge and agree with the findings.*",
369
  "introductory_note": r"This audit assesses the.*",
370
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"
371
- }
 
305
  "orientation": "row1",
306
  "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
307
  "priority": 90,
308
+ "context_keywords": ["auditor declaration", "NHVR"],
309
  "context_exclusions": ["manager", "operator declaration"]
310
  },
311
  "Audit Declaration dates": {
 
369
  "declaration_text": r"I hereby acknowledge and agree with the findings.*",
370
  "introductory_note": r"This audit assesses the.*",
371
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"
372
+ }
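The change pairs the new `context_keywords` list with the existing `context_exclusions` for the auditor-declaration schema. A hedged sketch of how such an entry could gate table matching on nearby text; the helper below is illustrative, not the project's actual matcher:

```python
# Illustrative only: one way a TABLE_SCHEMAS entry with context_keywords /
# context_exclusions could be checked against text surrounding a table.
schema = {
    "orientation": "row1",
    "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
    "priority": 90,
    "context_keywords": ["auditor declaration", "NHVR"],
    "context_exclusions": ["manager", "operator declaration"],
}

def schema_matches_context(schema: dict, context_text: str) -> bool:
    # Exclusions veto the match; keywords (if any) must appear at least once.
    text = context_text.lower()
    if any(word.lower() in text for word in schema.get("context_exclusions", [])):
        return False
    keywords = schema.get("context_keywords", [])
    return not keywords or any(word.lower() in text for word in keywords)

print(schema_matches_context(schema, "Auditor Declaration (NHVR)"))        # True
print(schema_matches_context(schema, "Operator Declaration - signature"))  # False
```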
pyproject.toml ADDED
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "pdf-document-layout-analysis"
3
+ version = "2025.03.18.03"
4
+ description = "This tool is for PDF document layout analysis"
5
+ license = { file = "LICENSE" }
6
+ authors = [{ name = "HURIDOCS" }]
7
+ requires-python = ">= 3.10"
8
+ dependencies = [
9
+ "fastapi==0.111.1",
10
+ "python-multipart==0.0.9",
11
+ "uvicorn==0.30.3",
12
+ "gunicorn==22.0.0",
13
+ "requests==2.32.3",
14
+ "torch==2.4.0",
15
+ "torchvision==0.19.0",
16
+ "timm==1.0.8",
17
+ "Pillow==10.4.0",
18
+ "pdf-annotate==0.12.0",
19
+ "scipy==1.14.0",
20
+ "opencv-python==4.10.0.84",
21
+ "Shapely==2.0.5",
22
+ "transformers==4.40.2",
23
+ "huggingface_hub==0.23.5",
24
+ "pdf2image==1.17.0",
25
+ "lxml==5.2.2",
26
+ "lightgbm==4.5.0",
27
+ "setuptools==75.4.0",
28
+ "roman==4.2",
29
+ "hydra-core==1.3.2",
30
+ "pypandoc==1.13",
31
+ "rapid-latex-ocr==0.0.9",
32
+ "struct_eqtable @ git+https://github.com/UniModal4Reasoning/StructEqTable-Deploy.git@fd06078bfa9364849eb39330c075dd63cbed73ff"
33
+ ]
34
+
35
+ [project.urls]
36
+ HURIDOCS = "https://huridocs.org"
37
+ GitHub = "https://github.com/huridocs/pdf-document-layout-analysis"
38
+ HuggingFace = "https://huggingface.co/HURIDOCS/pdf-document-layout-analysis"
39
+ DockerHub = "https://hub.docker.com/r/huridocs/pdf-document-layout-analysis"
requirements.txt CHANGED
@@ -1,3 +1,30 @@
1
  gradio==4.44.1
2
  pytesseract
3
  python-docx
 
1
+ fastapi==0.111.1
2
+ pydantic==2.11.0
3
+ python-multipart==0.0.9
4
+ uvicorn==0.30.3
5
+ gunicorn==22.0.0
6
+ requests==2.32.3
7
+ torch==2.4.0
8
+ torchvision==0.19.0
9
+ Pillow==10.4.0
10
+ pdf-annotate==0.12.0
11
+ scipy==1.14.0
12
+ opencv-python==4.10.0.84
13
+ Shapely==2.0.5
14
+ transformers==4.40.2
15
+ huggingface_hub==0.23.5
16
+ pdf2image==1.17.0
17
+ lightgbm==4.5.0
18
+ setuptools==75.4.0
19
+ roman==4.2
20
+ hydra-core==1.3.2
21
+ pypandoc==1.13
22
+ rapid-table==2.0.3
23
+ rapidocr==3.2.0
24
+ pix2tex==0.1.4
25
+ latex2mathml==3.78.0
26
+ PyMuPDF==1.25.5
27
+ git+https://github.com/huridocs/pdf-features.git@2025.7.30.1
28
  gradio==4.44.1
29
  pytesseract
30
  python-docx
space-pdf/README.md ADDED
@@ -0,0 +1,910 @@
1
+ <h1 align="center">PDF Document Layout Analysis</h1>
2
+ <p align="center">A Docker-powered microservice for intelligent PDF document layout analysis, OCR, and content extraction</p>
3
+
4
+ <p align="center">
5
+ <img src="https://img.shields.io/badge/Python-3.10+-blue.svg" alt="Python Version">
6
+ <img src="https://img.shields.io/badge/FastAPI-0.111.1-green.svg" alt="FastAPI">
7
+ <img src="https://img.shields.io/badge/Docker-Ready-blue.svg" alt="Docker">
8
+ <img src="https://img.shields.io/badge/GPU-Supported-orange.svg" alt="GPU Support">
9
+ </p>
10
+
11
+
12
+ <div align="center">
13
+ <p><strong>Built with ❤️ by <a href="https://huridocs.org">HURIDOCS</a></strong></p>
14
+ <p>
15
+ <a href="https://github.com/huridocs/pdf-document-layout-analysis">⭐ Star us on GitHub</a> •
16
+ <a href="https://hub.docker.com/r/huridocs/pdf-document-layout-analysis">🐳 Pull from Docker Hub</a> •
17
+ <a href="https://huggingface.co/HURIDOCS/pdf-document-layout-analysis">🤗 View on Hugging Face</a>
18
+ </p>
19
+ </div>
20
+
21
+
22
+
23
+ ---
24
+
25
+ ## 🚀 Overview
26
+
27
+ This project provides a powerful and flexible PDF analysis microservice built with **Clean Architecture** principles. The service enables OCR, segmentation, and classification of different parts of PDF pages, identifying elements such as texts, titles, pictures, tables, formulas, and more. Additionally, it determines the correct reading order of these identified elements and can convert PDFs to various formats including Markdown and HTML.
28
+
29
+ ### ✨ Key Features
30
+
31
+ - 🔍 **Advanced PDF Layout Analysis** - Segment and classify PDF content with high accuracy
32
+ - 🖼️ **Visual & Fast Models** - Choose between VGT (Vision Grid Transformer) for accuracy or LightGBM for speed
33
+ - 📝 **Multi-format Output** - Export to JSON, Markdown, HTML, and visualize PDF segmentations
34
+ - 🌐 **OCR Support** - 150+ languages supported via Tesseract OCR
35
+ - 📊 **Table & Formula Extraction** - Extract tables as HTML and formulas as LaTeX
36
+ - 🏗️ **Clean Architecture** - Modular, testable, and maintainable codebase
37
+ - 🐳 **Docker-Ready** - Easy deployment with GPU support
38
+ - ⚡ **RESTful API** - Comprehensive API with 10+ endpoints
39
+
40
+ <table>
41
+ <tr>
42
+ <td>
43
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample1.png"/>
44
+ </td>
45
+ <td>
46
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample2.png"/>
47
+ </td>
48
+ <td>
49
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample3.png"/>
50
+ </td>
51
+ <td>
52
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample4.png"/>
53
+ </td>
54
+ </tr>
55
+ </table>
56
+
57
+ ### 🔗 Project Links
58
+
59
+ - **GitHub**: [pdf-document-layout-analysis](https://github.com/huridocs/pdf-document-layout-analysis)
60
+ - **HuggingFace**: [pdf-document-layout-analysis](https://huggingface.co/HURIDOCS/pdf-document-layout-analysis)
61
+ - **DockerHub**: [pdf-document-layout-analysis](https://hub.docker.com/r/huridocs/pdf-document-layout-analysis/)
62
+
63
+ ---
64
+
65
+ ## 🚀 Quick Start
66
+
67
+ ### 1. Start the Service
68
+
69
+ **With GPU support (recommended for better performance):**
70
+ ```bash
71
+ make start
72
+ ```
73
+
74
+ **Without GPU support:**
75
+ ```bash
76
+ make start_no_gpu
77
+ ```
78
+
79
+ The service will be available at `http://localhost:5060`
80
+
81
+ **Check service status:**
82
+
83
+ ```bash
84
+ curl http://localhost:5060/info
85
+ ```
86
+
87
+ ### 2. Basic PDF Analysis
88
+
89
+ **Analyze a PDF document (VGT model - high accuracy):**
90
+ ```bash
91
+ curl -X POST -F 'file=@/path/to/your/document.pdf' http://localhost:5060
92
+ ```
93
+
94
+ **Fast analysis (LightGBM models - faster processing):**
95
+ ```bash
96
+ curl -X POST -F 'file=@/path/to/your/document.pdf' -F "fast=true" http://localhost:5060
97
+ ```
98
+
99
+ ### 3. Stop the Service
100
+
101
+ ```bash
102
+ make stop
103
+ ```
104
+
105
+ > 💡 **Tip**: Replace `/path/to/your/document.pdf` with the actual path to your PDF file. The service will return a JSON response with segmented content and metadata.
106
+
107
+
108
+ ## 📋 Table of Contents
109
+
110
+ - [🚀 Quick Start](#-quick-start)
111
+ - [⚙️ Dependencies](#-dependencies)
112
+ - [📋 Requirements](#-requirements)
113
+ - [📚 API Reference](#-api-reference)
114
+ - [💡 Usage Examples](#-usage-examples)
115
+ - [🏗️ Architecture](#-architecture)
116
+ - [🤖 Models](#-models)
117
+ - [📊 Data](#-data)
118
+ - [🔧 Development](#-development)
119
+ - [📈 Benchmarks](#-benchmarks)
120
+ - [Performance](#performance)
121
+ - [Speed](#speed)
122
+ - [🌐 Installation of More Languages for OCR](#-installation-of-more-languages-for-ocr)
123
+ - [🔗 Related Services](#-related-services)
124
+ - [🤝 Contributing](#-contributing)
125
+
126
+
127
+
128
+ ## ⚙️ Dependencies
129
+
130
+ ### Required
131
+ - **Docker Desktop 4.25.0+** - [Installation Guide](https://www.docker.com/products/docker-desktop/)
132
+ - **Python 3.10+** (for local development)
133
+
134
+ ### Optional
135
+ - **NVIDIA Container Toolkit** - [Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (for GPU support)
136
+
137
+ ## 📋 Requirements
138
+
139
+ ### System Requirements
140
+ - **RAM**: 2 GB minimum
141
+ - **GPU Memory**: 5 GB (optional, will fall back to CPU if unavailable)
142
+ - **Disk Space**: 10 GB for models and dependencies
143
+ - **CPU**: Multi-core recommended for better performance
144
+
145
+ ### Docker Requirements
146
+ - Docker Engine 20.10+
147
+ - Docker Compose 2.0+
148
+
149
+ ## 📚 API Reference
150
+
151
+ The service provides a comprehensive RESTful API with the following endpoints:
152
+
153
+ ### Core Analysis Endpoints
154
+
155
+ | Endpoint | Method | Description | Parameters |
156
+ |----------|--------|-------------|------------|
157
+ | `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `parse_tables_and_math` |
158
+ | `/save_xml/{filename}` | POST | Analyze PDF and save XML output | `file`, `xml_file_name`, `fast` |
159
+ | `/get_xml/{filename}` | GET | Retrieve saved XML analysis | `xml_file_name` |
160
+
161
+ ### Content Extraction Endpoints
162
+
163
+ | Endpoint | Method | Description | Parameters |
164
+ |----------|--------|-------------|------------|
165
+ | `/text` | POST | Extract text by content types | `file`, `fast`, `types` |
166
+ | `/toc` | POST | Extract table of contents | `file`, `fast` |
167
+ | `/toc_legacy_uwazi_compatible` | POST | Extract TOC (Uwazi compatible) | `file` |
168
+
169
+ ### Format Conversion Endpoints
170
+
171
+ | Endpoint | Method | Description | Parameters |
172
+ |----------|--------|-------------|------------|
173
+ | `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
174
+ | `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
175
+ | `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
176
+
177
+ ### OCR & Utility Endpoints
178
+
179
+ | Endpoint | Method | Description | Parameters |
180
+ |----------|--------|-------------|------------|
181
+ | `/ocr` | POST | Apply OCR to PDF | `file`, `language` |
182
+ | `/info` | GET | Get service information | - |
183
+ | `/` | GET | Health check and system info | - |
184
+ | `/error` | GET | Test error handling | - |
185
+
186
+ ### Common Parameters
187
+
188
+ - **`file`**: PDF file to process (multipart/form-data)
189
+ - **`fast`**: Use LightGBM models instead of VGT (boolean, default: false)
190
+ - **`parse_tables_and_math`**: Parse tables (extracted as HTML) and convert formulas to LaTeX (boolean, default: false)
191
+ - **`language`**: OCR language code (string, default: "en")
192
+ - **`types`**: Comma-separated content types to extract (string, default: "all")
193
+ - **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
194
+ - **`dpi`**: Image resolution for conversion (integer, default: 120)
195
+
196
+ ## 💡 Usage Examples
197
+
198
+ ### Basic PDF Analysis
199
+
200
+ **Standard analysis with VGT model:**
201
+ ```bash
202
+ curl -X POST \
203
+ -F 'file=@document.pdf' \
204
+ http://localhost:5060
205
+ ```
206
+
207
+ **Fast analysis with LightGBM models:**
208
+ ```bash
209
+ curl -X POST \
210
+ -F 'file=@document.pdf' \
211
+ -F 'fast=true' \
212
+ http://localhost:5060
213
+ ```
214
+
215
+ **Analysis with table and math parsing:**
216
+ ```bash
217
+ curl -X POST \
218
+ -F 'file=@document.pdf' \
219
+ -F 'parse_tables_and_math=true' \
220
+ http://localhost:5060
221
+ ```
222
+
223
+ ### Text Extraction
224
+
225
+ **Extract all text:**
226
+ ```bash
227
+ curl -X POST \
228
+ -F 'file=@document.pdf' \
229
+ -F 'types=all' \
230
+ http://localhost:5060/text
231
+ ```
232
+
233
+ **Extract specific content types:**
234
+ ```bash
235
+ curl -X POST \
236
+ -F 'file=@document.pdf' \
237
+ -F 'types=title,text,table' \
238
+ http://localhost:5060/text
239
+ ```
240
+
241
+ ### Format Conversion
242
+
243
+ **Convert to Markdown:**
244
+ ```bash
245
+ curl -X POST http://localhost:5060/markdown \
246
+ -F 'file=@document.pdf' \
247
+ -F 'extract_toc=true' \
248
+ -F 'output_file=document.md' \
249
+ --output 'document.zip'
250
+ ```
251
+
252
+ **Convert to HTML:**
253
+ ```bash
254
+ curl -X POST http://localhost:5060/html \
255
+ -F 'file=@document.pdf' \
256
+ -F 'extract_toc=true' \
257
+ -F 'output_file=document.html' \
258
+ --output 'document.zip'
259
+ ```
260
+
261
+ > **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
262
+ > - **Coordinates**: `left`, `top`, `width`, `height`
263
+ > - **Page information**: `page_number`, `page_width`, `page_height`
264
+ > - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
265
+
266
+
267
+ ### OCR Processing
268
+
269
+ **OCR in English:**
270
+ ```bash
271
+ curl -X POST \
272
+ -F 'file=@scanned_document.pdf' \
273
+ -F 'language=en' \
274
+ http://localhost:5060/ocr \
275
+ --output ocr_processed.pdf
276
+ ```
277
+
278
+ **OCR in other languages:**
279
+ ```bash
280
+ # French
281
+ curl -X POST \
282
+ -F 'file=@document_french.pdf' \
283
+ -F 'language=fr' \
284
+ http://localhost:5060/ocr \
285
+ --output ocr_french.pdf
286
+
287
+ # Spanish
288
+ curl -X POST \
289
+ -F 'file=@document_spanish.pdf' \
290
+ -F 'language=es' \
291
+ http://localhost:5060/ocr \
292
+ --output ocr_spanish.pdf
293
+ ```
294
+
295
+ ### Visualization
296
+
297
+ **Generate visualization PDF:**
298
+ ```bash
299
+ curl -X POST \
300
+ -F 'file=@document.pdf' \
301
+ http://localhost:5060/visualize \
302
+ --output visualization.pdf
303
+ ```
304
+
305
+ ### Table of Contents Extraction
306
+
307
+ **Extract structured TOC:**
308
+ ```bash
309
+ curl -X POST \
310
+ -F 'file=@document.pdf' \
311
+ http://localhost:5060/toc
312
+ ```
313
+
314
+ ### XML Storage and Retrieval
315
+
316
+ **Analyze and save XML:**
317
+ ```bash
318
+ curl -X POST \
319
+ -F 'file=@document.pdf' \
320
+ http://localhost:5060/save_xml/my_analysis
321
+ ```
322
+
323
+ **Retrieve saved XML:**
324
+ ```bash
325
+ curl http://localhost:5060/get_xml/my_analysis.xml
326
+ ```
327
+
328
+ ### Service Information
329
+
330
+ **Get service info and supported languages:**
331
+ ```bash
332
+ curl http://localhost:5060/info
333
+ ```
334
+
335
+ **Health check:**
336
+ ```bash
337
+ curl http://localhost:5060/
338
+ ```
339
+
340
+ ### Response Format
341
+
342
+ Most endpoints return JSON with segment information:
343
+
344
+ ```json
345
+ [
346
+ {
347
+ "left": 72.0,
348
+ "top": 84.0,
349
+ "width": 451.2,
350
+ "height": 23.04,
351
+ "page_number": 1,
352
+ "page_width": 595.32,
353
+ "page_height": 841.92,
354
+ "text": "Document Title",
355
+ "type": "Title"
356
+ },
357
+ {
358
+ "left": 72.0,
359
+ "top": 120.0,
360
+ "width": 451.2,
361
+ "height": 200.0,
362
+ "page_number": 1,
363
+ "page_width": 595.32,
364
+ "page_height": 841.92,
365
+ "text": "This is the main text content...",
366
+ "type": "Text"
367
+ }
368
+ ]
369
+ ```
370
+
371
+ ### Supported Content Types
372
+
373
+ - `Caption` - Image and table captions
374
+ - `Footnote` - Footnote text
375
+ - `Formula` - Mathematical formulas
376
+ - `List item` - List items and bullet points
377
+ - `Page footer` - Footer content
378
+ - `Page header` - Header content
379
+ - `Picture` - Images and figures
380
+ - `Section header` - Section headings
381
+ - `Table` - Table content
382
+ - `Text` - Regular text paragraphs
383
+ - `Title` - Document and section titles
384
+
385
+
386
+ ## 🏗️ Architecture
387
+
388
+ This project follows **Clean Architecture** principles, ensuring separation of concerns, testability, and maintainability. The codebase is organized into distinct layers:
389
+
390
+ ### Directory Structure
391
+
392
+ ```
393
+ src/
394
+ ├── domain/ # Enterprise Business Rules
395
+ │ ├── PdfImages.py # PDF image handling domain logic
396
+ │ ├── PdfSegment.py # PDF segment entity
397
+ │ ├── Prediction.py # ML prediction entity
398
+ │ └── SegmentBox.py # Core segment box entity
399
+ ├── use_cases/ # Application Business Rules
400
+ │ ├── pdf_analysis/ # PDF analysis use case
401
+ │ ├── text_extraction/ # Text extraction use case
402
+ │ ├── toc_extraction/ # Table of contents extraction
403
+ │ ├── visualization/ # PDF visualization use case
404
+ │ ├── ocr/ # OCR processing use case
405
+ │ ├── markdown_conversion/ # Markdown conversion use case
406
+ │ └── html_conversion/ # HTML conversion use case
407
+ ├── adapters/ # Interface Adapters
408
+ │ ├── infrastructure/ # External service adapters
409
+ │ ├── ml/ # Machine learning model adapters
410
+ │ ├── storage/ # File storage adapters
411
+ │ └── web/ # Web framework adapters
412
+ ├── ports/ # Interface definitions
413
+ │ ├── services/ # Service interfaces
414
+ │ └── repositories/ # Repository interfaces
415
+ └── drivers/ # Frameworks & Drivers
416
+ └── web/ # FastAPI application setup
417
+ ```
418
+
419
+ ### Layer Responsibilities
420
+
421
+ - **Domain Layer**: Contains core business entities and rules independent of external concerns
422
+ - **Use Cases Layer**: Orchestrates domain entities to fulfill specific application requirements
423
+ - **Adapters Layer**: Implements interfaces defined by inner layers and adapts external frameworks
424
+ - **Drivers Layer**: Contains frameworks, databases, and external agency configurations
425
+
426
+ ### Key Benefits
427
+
428
+ - 🔄 **Dependency Inversion**: High-level modules don't depend on low-level modules
429
+ - 🧪 **Testability**: Easy to unit test business logic in isolation
430
+ - 🔧 **Maintainability**: Changes to external frameworks don't affect business rules
431
+ - 📈 **Scalability**: Easy to add new features without modifying existing code
432
+
433
+
434
+ ## 🤖 Models
435
+
436
+ The service offers two complementary model approaches, each optimized for different use cases:
437
+
438
+ ### 1. Vision Grid Transformer (VGT) - High Accuracy Model
439
+
440
+ **Overview**: A state-of-the-art visual model developed by Alibaba Research Group that "sees" the entire page layout.
441
+
442
+ **Key Features**:
443
+ - 🎯 **High Accuracy**: Best-in-class performance on document layout analysis
444
+ - 👁️ **Visual Understanding**: Analyzes the entire page context including spatial relationships
445
+ - 📊 **Trained on DocLayNet**: Uses the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet)
446
+ - 🔬 **Research-Backed**: Based on [Advanced Literate Machinery](https://github.com/AlibabaResearch/AdvancedLiterateMachinery)
447
+
448
+ **Resource Requirements**:
449
+ - GPU: 5GB+ VRAM (recommended)
450
+ - CPU: Falls back automatically if GPU unavailable
451
+ - Processing Speed: ~1.75 seconds/page (GPU [GTX 1070]) or ~13.5 seconds/page (CPU [i7-8700])
452
+
453
+ ### 2. LightGBM Models - Fast & Efficient
454
+
455
+ **Overview**: Lightweight ensemble of two specialized models using XML-based features from Poppler.
456
+
457
+ **Key Features**:
458
+ - ⚡ **High Speed**: ~0.42 seconds per page on CPU (i7-8700)
459
+ - 💾 **Low Resource Usage**: CPU-only, minimal memory footprint
460
+ - 🔄 **Dual Model Approach**:
461
+ - **Token Type Classifier**: Identifies content types (title, text, table, etc.)
462
+ - **Segmentation Model**: Determines proper content boundaries
463
+ - 📄 **XML-Based**: Uses Poppler's PDF-to-XML conversion for feature extraction
464
+
465
+ **Trade-offs**:
466
+ - Slightly lower accuracy compared to VGT
467
+ - No visual context understanding
468
+ - Excellent for batch processing and resource-constrained environments
469
+
470
+ ### OCR Integration
471
+
472
+ Both models integrate seamlessly with OCR capabilities:
473
+
474
+ - **Engine**: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
475
+ - **Processing**: [ocrmypdf](https://ocrmypdf.readthedocs.io/en/latest/index.html)
476
+ - **Languages**: 150+ supported languages
477
+ - **Output**: Searchable PDFs with preserved layout
478
+
479
+ ### Model Selection Guide
480
+
481
+ | Use Case | Recommended Model | Reason |
482
+ |----------|------------------|---------|
483
+ | High accuracy requirements | VGT | Superior visual understanding |
484
+ | Batch processing | LightGBM | Faster processing, lower resources |
485
+ | GPU available | VGT | Leverages GPU acceleration |
486
+ | CPU-only environment | LightGBM | Optimized for CPU processing |
487
+ | Real-time applications | LightGBM | Consistent fast response times |
488
+ | Research/analysis | VGT | Best accuracy for detailed analysis |
489
+
490
+ ## 📊 Data
491
+
492
+ ### Training Dataset
493
+
494
+ Both model types are trained on the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet), a large-scale document layout analysis dataset containing over 80,000 document pages.
495
+
496
+ ### Document Categories
497
+
498
+ The models can identify and classify 11 distinct content types:
499
+
500
+ | ID | Category | Description |
501
+ |----|----------|-------------|
502
+ | 1 | **Caption** | Image and table captions |
503
+ | 2 | **Footnote** | Footnote references and text |
504
+ | 3 | **Formula** | Mathematical equations and formulas |
505
+ | 4 | **List item** | Bulleted and numbered list items |
506
+ | 5 | **Page footer** | Footer content and page numbers |
507
+ | 6 | **Page header** | Header content and titles |
508
+ | 7 | **Picture** | Images, figures, and graphics |
509
+ | 8 | **Section header** | Section and subsection headings |
510
+ | 9 | **Table** | Tabular data and structures |
511
+ | 10 | **Text** | Regular paragraph text |
512
+ | 11 | **Title** | Document and chapter titles |
513
+
514
+ ### Dataset Characteristics
515
+
516
+ - **Domain Coverage**: Academic papers, technical documents, reports
517
+ - **Language**: Primarily English with multilingual support
518
+ - **Quality**: High-quality annotations with bounding boxes and labels
519
+ - **Diversity**: Various document layouts, fonts, and formatting styles
520
+
521
+ For detailed information about the dataset, visit the [DocLayNet repository](https://github.com/DS4SD/DocLayNet).
522
+
523
+ ## 🔧 Development
524
+
525
+ ### Local Development Setup
526
+
527
+ 1. **Clone the repository:**
528
+ ```bash
529
+ git clone https://github.com/huridocs/pdf-document-layout-analysis.git
530
+ cd pdf-document-layout-analysis
531
+ ```
532
+
533
+ 2. **Create virtual environment:**
534
+ ```bash
535
+ make install_venv
536
+ ```
537
+
538
+ 3. **Activate environment:**
539
+ ```bash
540
+ make activate
541
+ # or manually: source .venv/bin/activate
542
+ ```
543
+
544
+ 4. **Install dependencies:**
545
+ ```bash
546
+ make install
547
+ ```
548
+
549
+ ### Code Quality
550
+
551
+ **Format code:**
552
+ ```bash
553
+ make formatter
554
+ ```
555
+
556
+ **Check formatting:**
557
+ ```bash
558
+ make check_format
559
+ ```
560
+
561
+ ### Testing
562
+
563
+ **Run tests:**
564
+ ```bash
565
+ make test
566
+ ```
567
+
568
+ **Integration tests:**
569
+ ```bash
570
+ # Tests are located in src/tests/integration/
571
+ python -m pytest src/tests/integration/test_end_to_end.py
572
+ ```
573
+
574
+ ### Docker Development
575
+
576
+ **Build and start (detached mode):**
577
+ ```bash
578
+ # With GPU
579
+ make start_detached_gpu
580
+
581
+ # Without GPU
582
+ make start_detached
583
+ ```
584
+
585
+ **Clean up Docker resources:**
586
+ ```bash
587
+ # Remove containers
588
+ make remove_docker_containers
589
+
590
+ # Remove images
591
+ make remove_docker_images
592
+ ```
593
+
594
+ ### Project Structure
595
+
596
+ ```
597
+ pdf-document-layout-analysis/
598
+ ├── src/ # Source code
599
+ │ ├── domain/ # Business entities
600
+ │ ├── use_cases/ # Application logic
601
+ │ ├── adapters/ # External integrations
602
+ │ ├── ports/ # Interface definitions
603
+ │ └── drivers/ # Framework configurations
604
+ ├── test_pdfs/ # Test PDF files
605
+ ├── models/ # ML model storage
606
+ ├── docker-compose.yml # Docker configuration
607
+ ├── Dockerfile # Container definition
608
+ ├── Makefile # Development commands
609
+ ├── pyproject.toml # Python project configuration
610
+ └── requirements.txt # Python dependencies
611
+ ```
612
+
613
+ ### Environment Variables
614
+
615
+ Key configuration options:
616
+
617
+ ```bash
618
+ # OCR configuration
619
+ OCR_SOURCE=/tmp/ocr_source
620
+
621
+ # Model paths (auto-configured)
622
+ MODELS_PATH=./models
623
+
624
+ # Service configuration
625
+ HOST=0.0.0.0
626
+ PORT=5060
627
+ ```
628
+
629
+ ### Adding New Features
630
+
631
+ 1. **Domain Logic**: Add entities in `src/domain/`
632
+ 2. **Use Cases**: Implement business logic in `src/use_cases/`
633
+ 3. **Adapters**: Create integrations in `src/adapters/`
634
+ 4. **Ports**: Define interfaces in `src/ports/`
635
+ 5. **Controllers**: Add endpoints in `src/adapters/web/`
636
+
637
+ ### Debugging
638
+
639
+ **View logs:**
640
+ ```bash
641
+ docker compose logs -f
642
+ ```
643
+
644
+ **Access container:**
645
+ ```bash
646
+ docker exec -it pdf-document-layout-analysis /bin/bash
647
+ ```
648
+
649
+ **Free up disk space:**
650
+ ```bash
651
+ make free_up_space
652
+ ```
653
+
654
+ ### Order of Output Elements
655
+
656
+ The service returns SegmentBox elements in a carefully determined reading order:
657
+
658
+ #### Reading Order Algorithm
659
+
660
+ 1. **Poppler Integration**: Uses [Poppler](https://poppler.freedesktop.org) PDF-to-XML conversion to establish initial token reading order
661
+ 2. **Segment Averaging**: Calculates average reading order for multi-token segments
662
+ 3. **Type-Based Sorting**: Prioritizes content types:
663
+ - **Headers** placed first
664
+ - **Main content** in reading order
665
+ - **Footers and footnotes** placed last
666
+
667
+ #### Non-Text Elements
668
+
669
+ For segments without text (e.g., images):
670
+ - Processed after text-based sorting
671
+ - Positioned based on nearest text segment proximity
672
+ - Uses spatial distance as the primary criterion
673
+
674
+ ### Advanced Table and Formula Extraction
675
+
676
+ #### Default Behavior
677
+ - **Formulas**: Automatically extracted as LaTeX format in the `text` property
678
+ - **Tables**: Basic text extraction included by default
679
+
680
+ #### Enhanced Table Extraction
681
+
682
+ Parse tables and extract them in HTML format by setting `parse_tables_and_math=true`:
683
+
684
+ ```bash
685
+ curl -X POST -F 'file=@document.pdf' -F 'parse_tables_and_math=true' http://localhost:5060
686
+ ```
687
+
688
+
689
+ #### Extraction Engines
690
+ - **Formulas**: [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
691
+ - **Tables**: [RapidTable](https://github.com/RapidAI/RapidTable)
692
+
693
+
694
+ ## 📈 Benchmarks
695
+
696
+ ### Performance
697
+
698
+ VGT model performance on PubLayNet dataset:
699
+
700
+ | Metric | Overall | Text | Title | List | Table | Figure |
701
+ |--------|---------|------|-------|------|-------|--------|
702
+ | **F1 Score** | **0.962** | 0.950 | 0.939 | 0.968 | 0.981 | 0.971 |
703
+
704
+ > 📊 **Comparison**: View comprehensive model comparisons at [Papers With Code](https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val)
705
+
706
+ ### Speed
707
+
708
+ Performance benchmarks on 15-page academic documents:
709
+
710
+ | Model | Hardware | Speed (sec/page) | Use Case |
711
+ |-------|----------|------------------|----------|
712
+ | **LightGBM** | CPU (i7-8700 3.2GHz) | **0.42** | Fast processing |
713
+ | **VGT** | GPU (GTX 1070) | **1.75** | High accuracy |
714
+ | **VGT** | CPU (i7-8700 3.2GHz) | 13.5 | CPU fallback |
715
+
716
+ ### Performance Recommendations
717
+
718
+ - **GPU Available**: Use VGT for best accuracy-speed balance
719
+ - **CPU Only**: Use LightGBM for optimal performance
720
+ - **Batch Processing**: LightGBM for consistent throughput
721
+ - **High Accuracy**: VGT with GPU for best results
722
+
723
+
724
+ ## 🌐 Installation of More Languages for OCR
725
+
726
+ The service uses Tesseract OCR with support for 150+ languages. The Docker image includes only common languages to minimize image size.
727
+
728
+ ### Installing Additional Languages
729
+
730
+ #### 1. Access the Container
731
+ ```bash
732
+ docker exec -it --user root pdf-document-layout-analysis /bin/bash
733
+ ```
734
+
735
+ #### 2. Install Language Packs
736
+ ```bash
737
+ # Install specific language
738
+ apt-get update
739
+ apt-get install tesseract-ocr-[LANGCODE]
740
+ ```
741
+
742
+ #### 3. Common Language Examples
743
+
744
+ ```bash
745
+ # Korean
746
+ apt-get install tesseract-ocr-kor
747
+
748
+ # German
749
+ apt-get install tesseract-ocr-deu
750
+
751
+ # French
752
+ apt-get install tesseract-ocr-fra
753
+
754
+ # Spanish
755
+ apt-get install tesseract-ocr-spa
756
+
757
+ # Chinese Simplified
758
+ apt-get install tesseract-ocr-chi-sim
759
+
760
+ # Arabic
761
+ apt-get install tesseract-ocr-ara
762
+
763
+ # Japanese
764
+ apt-get install tesseract-ocr-jpn
765
+ ```
766
+
767
+ #### 4. Verify Installation
768
+
769
+ ```bash
770
+ curl http://localhost:5060/info
771
+ ```
772
+
773
+ ### Language Code Reference
774
+
775
+ Find Tesseract language codes in the [ISO to Tesseract mapping](https://github.com/huridocs/pdf-document-layout-analysis/blob/main/src/adapters/infrastructure/ocr/languages.py).
776
+
777
+ ### Supported Languages
778
+
779
+ Common language codes:
780
+ - `eng` - English
781
+ - `fra` - French
782
+ - `deu` - German
783
+ - `spa` - Spanish
784
+ - `ita` - Italian
785
+ - `por` - Portuguese
786
+ - `rus` - Russian
787
+ - `chi-sim` - Chinese Simplified
788
+ - `chi-tra` - Chinese Traditional
789
+ - `jpn` - Japanese
790
+ - `kor` - Korean
791
+ - `ara` - Arabic
792
+ - `hin` - Hindi
793
+
794
+ ### Usage with Multiple Languages
795
+
796
+ ```bash
797
+ # OCR with specific language
798
+ curl -X POST \
799
+ -F 'file=@document.pdf' \
800
+ -F 'language=fr' \
801
+ http://localhost:5060/ocr \
802
+ --output french_ocr.pdf
803
+ ```
804
+
805
+
806
+ ## 🔗 Related Services
807
+
808
+ Explore our ecosystem of PDF processing services built on this foundation:
809
+
810
+ ### [PDF Table of Contents Extractor](https://github.com/huridocs/pdf-table-of-contents-extractor)
811
+ 🔍 **Purpose**: Intelligent extraction of structured table of contents from PDF documents
812
+
813
+ **Key Features**:
814
+ - Leverages layout analysis for accurate TOC identification
815
+ - Hierarchical structure recognition
816
+ - Multiple output formats supported
817
+ - Integration-ready API
818
+
819
+ ### [PDF Text Extraction](https://github.com/huridocs/pdf-text-extraction)
820
+ 📝 **Purpose**: Advanced text extraction with layout awareness
821
+
822
+ **Key Features**:
823
+ - Content-type aware extraction
824
+ - Preserves document structure
825
+ - Reading order optimization
826
+ - Clean text output with metadata
827
+
828
+ ### Integration Benefits
829
+
830
+ These services work seamlessly together:
831
+ - **Shared Analysis**: Reuse layout analysis results across services
832
+ - **Consistent Output**: Standardized JSON format for easy integration
833
+ - **Scalable Architecture**: Deploy services independently or together
834
+ - **Docker Ready**: All services containerized for easy deployment
835
+
836
+ ## 🤝 Contributing
837
+
838
+ We welcome contributions to improve the PDF Document Layout Analysis service!
839
+
840
+ ### How to Contribute
841
+
842
+ 1. **Fork the Repository**
843
+ ```bash
844
+ git clone https://github.com/your-username/pdf-document-layout-analysis.git
845
+ ```
846
+
847
+ 2. **Create a Feature Branch**
848
+ ```bash
849
+ git checkout -b feature/your-feature-name
850
+ ```
851
+
852
+ 3. **Set Up Development Environment**
853
+ ```bash
854
+ make install_venv
855
+ make install
856
+ ```
857
+
858
+ 4. **Make Your Changes**
859
+ - Follow the Clean Architecture principles
860
+ - Add tests for new features
861
+ - Update documentation as needed
862
+
863
+ 5. **Run Tests and Quality Checks**
864
+ ```bash
865
+ make test
866
+ make check_format
867
+ ```
868
+
869
+ 6. **Submit a Pull Request**
870
+ - Provide clear description of changes
871
+ - Include test results
872
+ - Reference any related issues
873
+
874
+ ### Contribution Guidelines
875
+
876
+ #### Code Standards
877
+ - **Python**: Follow PEP 8 with 125-character line length
878
+ - **Architecture**: Maintain Clean Architecture boundaries
879
+ - **Testing**: Include unit tests for new functionality
880
+ - **Documentation**: Update README and docstrings
881
+
882
+ #### Areas for Contribution
883
+
884
+ - 🐛 **Bug Fixes**: Report and fix issues
885
+ - ✨ **New Features**: Add new endpoints or functionality
886
+ - 📚 **Documentation**: Improve guides and examples
887
+ - 🧪 **Testing**: Expand test coverage
888
+ - 🚀 **Performance**: Optimize processing speed
889
+ - 🌐 **Internationalization**: Add language support
890
+
891
+ #### Development Workflow
892
+
893
+ 1. **Issue First**: Create or comment on relevant issues
894
+ 2. **Small PRs**: Keep pull requests focused and manageable
895
+ 3. **Clean Commits**: Use descriptive commit messages
896
+ 4. **Documentation**: Update relevant documentation
897
+ 5. **Testing**: Ensure all tests pass
898
+
899
+ ### Getting Help
900
+
901
+ - 📚 **Documentation**: Check this README and inline docs
902
+ - 💬 **Issues**: Search existing issues or create new ones
903
+ - 🔍 **Code**: Explore the codebase structure
904
+ - 📧 **Contact**: Reach out to maintainers for guidance
905
+
906
+ ---
907
+
908
+ ### License
909
+
910
+ This project is licensed under the terms specified in the [LICENSE](LICENSE) file.
space-pdf/app.py ADDED
@@ -0,0 +1,124 @@
1
+ import gradio as gr
2
+ import tempfile
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ SCRIPT_DIR = Path(__file__).resolve().parent
9
+
10
+ def run_cmd(cmd, cwd=None, env=None):
11
+ """Run a command, print nice logs, and also save them to run.log in cwd."""
12
+ cwd = str(cwd or os.getcwd())
13
+ print(f"🟦 Running: {' '.join(cmd)} (cwd={cwd})")
14
+ proc = subprocess.run(
15
+ cmd,
16
+ cwd=cwd,
17
+ env=env,
18
+ capture_output=True,
19
+ text=True
20
+ )
21
+ if proc.stdout:
22
+ print("🟩 STDOUT:")
23
+ print(proc.stdout)
24
+ if proc.stderr:
25
+ print("🟥 STDERR:")
26
+ print(proc.stderr)
27
+ # Save to run.log for debugging
28
+ try:
29
+ runlog = Path(cwd) / "run.log"
30
+ with open(runlog, "a", encoding="utf-8") as f:
31
+ f.write(f"$ {' '.join(cmd)}\n")
32
+ if proc.stdout:
33
+ f.write(proc.stdout + "\n")
34
+ if proc.stderr:
35
+ f.write(proc.stderr + "\n")
36
+ print(f"🧾 Run log saved to: {runlog}")
37
+ except Exception as e:
38
+ print(f"⚠️ Could not write run.log: {e}")
39
+
40
+ if proc.returncode != 0:
41
+ # Let Gradio see the failure so it surfaces properly
42
+ raise subprocess.CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr)
43
+ return proc
44
+
45
+ def _locate_pdf_json(temp_dir: str) -> str:
46
+ """
47
+ Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json.
48
+ Find it (and a few common fallbacks). Raise if not found.
49
+ """
50
+ td = Path(temp_dir)
51
+
52
+ # Prefer exactly-named file if present
53
+ candidates = [
54
+ td / "pdf_data.json", # legacy name (if ever created)
55
+ td / "input_comprehensive_data.json", # most common from your logs
56
+ td / "comprehensive_data.json", # another common alias
57
+ td / "output.json", # generic
58
+ ]
59
+ for p in candidates:
60
+ if p.exists():
61
+ print(f"✅ Using PDF JSON: {p}")
62
+ return str(p)
63
+
64
+ # Generic pattern: anything *_comprehensive_data.json
65
+ globs = list(td.glob("*_comprehensive_data.json"))
66
+ if globs:
67
+ print(f"✅ Using PDF JSON (glob): {globs[0]}")
68
+ return str(globs[0])
69
+
70
+ # If still not found, surface a helpful error
71
+ searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
72
+ raise FileNotFoundError(
73
+ f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
74
+ )
75
+
76
+ def process_files(pdf_file, word_file):
77
+ # Create a unique temporary directory for this run
78
+ temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
79
+ print(f"📂 Temp dir: {temp_dir}")
80
+
81
+ # Define standard filenames for use in the pipeline
82
+ pdf_path = os.path.join(temp_dir, "input.pdf")
83
+ word_path = os.path.join(temp_dir, "input.docx")
84
+ word_json_path = os.path.join(temp_dir, "word_data.json")
85
+ updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
86
+ final_docx_path = os.path.join(temp_dir, "updated.docx")
87
+
88
+ # Copy the uploaded files to the temp directory
89
+ shutil.copy(pdf_file, pdf_path)
90
+ print(f"📄 PDF copied to: {pdf_path}")
91
+ shutil.copy(word_file, word_path)
92
+ print(f"📝 DOCX copied to: {word_path}")
93
+
94
+ # 1) PDF → JSON (extractor writes <stem>_comprehensive_data.json into cwd)
95
+ run_cmd(["python", str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)
96
+
97
+ # Find the JSON produced by the extractor
98
+ pdf_json_path = _locate_pdf_json(temp_dir)
99
+
100
+ # 2) DOCX red text → JSON
101
+ run_cmd(["python", str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)
102
+
103
+ # 3) Merge JSON (uses the resolved pdf_json_path)
104
+ run_cmd(["python", str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)
105
+
106
+ # 4) Apply updates to DOCX
107
+ run_cmd(["python", str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)
108
+
109
+ # Return the final .docx file
110
+ return final_docx_path
111
+
112
+ iface = gr.Interface(
113
+ fn=process_files,
114
+ inputs=[
115
+ gr.File(label="Upload PDF File", type="filepath"),
116
+ gr.File(label="Upload Word File", type="filepath")
117
+ ],
118
+ outputs=gr.File(label="Download Updated Word File"),
119
+ title="Red Text Replacer",
120
+ description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF."
121
+ )
122
+
123
+ if __name__ == "__main__":
124
+ iface.launch()
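For a quick check outside the web UI, the same pipeline can be driven directly from Python; `sample.pdf` and `template.docx` are placeholder file names, and this assumes the helper scripts and their dependencies are installed alongside `app.py`.

```python
# Run the four-step pipeline without launching Gradio.
from app import process_files

result_path = process_files("sample.pdf", "template.docx")
print("Updated document written to:", result_path)
```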
space-pdf/extract_pdf_data.py ADDED
@@ -0,0 +1,534 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fixed PDF Data Extractor - Addresses key issues in comprehensive_extract.py
4
+
5
+ Key fixes:
6
+ 1. Better table extraction and cleaning
7
+ 2. Improved key-value pair extraction
8
+ 3. More robust text processing
9
+ 4. Enhanced vehicle registration extraction
10
+ 5. Better date/number pattern recognition
11
+ """
12
+
13
+ import json
14
+ import re
15
+ import pandas as pd
16
+ from typing import Dict, List, Any, Optional
17
+ import logging
18
+ from pathlib import Path
19
+ import sys
20
+ from datetime import datetime
21
+
22
+ try:
23
+ import pdfplumber
24
+ HAS_PDFPLUMBER = True
25
+ except ImportError:
26
+ HAS_PDFPLUMBER = False
27
+
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger("fixed_pdf_extractor")
30
+
31
+ class FixedPDFExtractor:
32
+ def __init__(self):
33
+ logger.info("🚀 Initializing Fixed PDF Extractor")
34
+
35
+ def extract_everything(self, pdf_path: str) -> Dict[str, Any]:
36
+ if not HAS_PDFPLUMBER:
37
+ raise RuntimeError("pdfplumber is required. Install with: pip install pdfplumber")
38
+
39
+ logger.info(f"📖 Processing PDF: {pdf_path}")
40
+ result = {
41
+ "document_info": {
42
+ "filename": Path(pdf_path).name,
43
+ "total_pages": 0,
44
+ "extraction_timestamp": datetime.now().isoformat()
45
+ },
46
+ "extracted_data": {
47
+ "all_text_content": [],
48
+ "all_tables": [],
49
+ "key_value_pairs": {},
50
+ "audit_information": {},
51
+ "operator_information": {},
52
+ "vehicle_registrations": [],
53
+ "driver_records": [],
54
+ "compliance_summary": {},
55
+ "dates_and_numbers": {}
56
+ }
57
+ }
58
+
59
+ all_text_blocks, all_tables = [], []
60
+
61
+ with pdfplumber.open(pdf_path) as pdf:
62
+ result["document_info"]["total_pages"] = len(pdf.pages)
63
+
64
+ for page_num, page in enumerate(pdf.pages, 1):
65
+ logger.info(f"📄 Processing page {page_num}")
66
+
67
+ # Extract text with better handling
68
+ page_text = self._extract_page_text(page)
69
+ if page_text:
70
+ all_text_blocks.append({
71
+ "page": page_num,
72
+ "text": page_text,
73
+ "word_count": len(page_text.split())
74
+ })
75
+
76
+ # Extract tables with improved cleaning
77
+ tables = self._extract_page_tables(page, page_num)
78
+ all_tables.extend(tables)
79
+
80
+ result["extracted_data"]["all_text_content"] = all_text_blocks
81
+ result["extracted_data"]["all_tables"] = all_tables
82
+
83
+ # Process extracted data with improved methods
84
+ combined_text = "\n\n".join(b["text"] for b in all_text_blocks)
85
+
86
+ result["extracted_data"]["key_value_pairs"] = self._extract_key_value_pairs_improved(combined_text)
87
+ result["extracted_data"]["audit_information"] = self._extract_audit_info(combined_text, all_tables)
88
+ result["extracted_data"]["operator_information"] = self._extract_operator_info(combined_text, all_tables)
89
+ result["extracted_data"]["vehicle_registrations"] = self._extract_vehicle_registrations(all_tables)
90
+ result["extracted_data"]["driver_records"] = self._extract_driver_records(all_tables)
91
+ result["extracted_data"]["compliance_summary"] = self._extract_compliance_summary(combined_text, all_tables)
92
+ result["extracted_data"]["dates_and_numbers"] = self._extract_dates_and_numbers_improved(combined_text)
93
+
94
+ # Generate summary
95
+ result["extraction_summary"] = {
96
+ "text_blocks_found": len(all_text_blocks),
97
+ "tables_found": len(all_tables),
98
+ "key_value_pairs_found": len(result["extracted_data"]["key_value_pairs"]),
99
+ "vehicle_registrations_found": len(result["extracted_data"]["vehicle_registrations"]),
100
+ "driver_records_found": len(result["extracted_data"]["driver_records"]),
101
+ "total_characters": len(combined_text),
102
+ "processing_timestamp": datetime.now().isoformat()
103
+ }
104
+
105
+ logger.info("✅ Extraction completed!")
106
+ return result
107
+
108
+ def _extract_page_text(self, page) -> Optional[str]:
109
+ """Extract text from page with better handling"""
110
+ try:
111
+ text = page.extract_text()
112
+ if text:
113
+ # Clean up text
114
+ text = re.sub(r'[ \t]+', ' ', text.strip())
115
+ text = re.sub(r'\n\s*\n', '\n', text)
116
+ return text
117
+ except Exception as e:
118
+ logger.warning(f"Failed to extract text from page: {e}")
119
+ return None
120
+
121
+ def _extract_page_tables(self, page, page_num: int) -> List[Dict]:
122
+ """Extract tables with improved processing"""
123
+ tables = []
124
+ try:
125
+ raw_tables = page.extract_tables()
126
+ if raw_tables:
127
+ for table_idx, table in enumerate(raw_tables):
128
+ cleaned_table = self._clean_table_improved(table)
129
+ if cleaned_table and len(cleaned_table) > 0:
130
+ tables.append({
131
+ "page": page_num,
132
+ "table_index": table_idx + 1,
133
+ "headers": cleaned_table[0] if cleaned_table else [],
134
+ "data": cleaned_table[1:] if len(cleaned_table) > 1 else [],
135
+ "raw_data": cleaned_table,
136
+ "row_count": len(cleaned_table) - 1 if len(cleaned_table) > 1 else 0,
137
+ "column_count": len(cleaned_table[0]) if cleaned_table else 0
138
+ })
139
+ except Exception as e:
140
+ logger.warning(f"Failed to extract tables from page {page_num}: {e}")
141
+
142
+ return tables
143
+
144
+ def _clean_table_improved(self, table: List[List]) -> List[List[str]]:
145
+ """Improved table cleaning with better cell processing"""
146
+ if not table:
147
+ return []
148
+
149
+ cleaned = []
150
+ for row in table:
151
+ cleaned_row = []
152
+ for cell in row:
153
+ if cell is None:
154
+ cleaned_cell = ""
155
+ else:
156
+ cleaned_cell = str(cell).strip()
157
+ cleaned_cell = re.sub(r'\s+', ' ', cleaned_cell)
158
+ cleaned_cell = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', cleaned_cell)
159
+ cleaned_row.append(cleaned_cell)
160
+ if any(cell.strip() for cell in cleaned_row):
161
+ cleaned.append(cleaned_row)
162
+
163
+ # Rows are already cleaned; the equal-length check below does not change the result
164
+ if cleaned and all(len(r) == len(cleaned[0]) for r in cleaned):
165
+ return cleaned
166
+ return cleaned
167
+
168
+ def _extract_key_value_pairs_improved(self, text: str) -> Dict[str, str]:
169
+ """Improved key-value pair extraction with better cleaning"""
170
+ pairs: Dict[str, str] = {}
171
+
172
+ # Normalize text a bit for regex stability
173
+ t = text.replace('\r', '\n')
174
+
175
+ # Pattern 1: colon-separated pairs (key: value)
176
+ pattern1 = re.compile(
177
+ r'([A-Za-z][\w\s()/\-.]{2,80}?):\s*([^\n\r:][^\n\r]*)'
178
+ )
179
+ for key, val in pattern1.findall(t):
180
+ k = key.strip()
181
+ v = val.strip()
182
+ # Filter junk: very long values, pure separators, or obvious headers
183
+ if not v or len(v) > 200:
184
+ continue
185
+ if re.fullmatch(r'[-_/\.]+', v):
186
+ continue
187
+ # Avoid capturing the next key as value by trimming trailing key-like tokens
188
+ v = re.sub(r'\s+[A-Z][\w\s()/\-.]{2,40}:$', '', v).strip()
189
+ # Skip values that are just long digit runs (likely id lists without meaning)
190
+ if re.fullmatch(r'\d{6,}', v):
191
+ continue
192
+ pairs[k] = v
193
+
194
+ # Pattern 2: inline “Key – Value” or “Key — Value”
195
+ pattern2 = re.compile(r'([A-Za-z][\w\s()/\-.]{2,80}?)\s*[–—-]\s*([^\n\r]+)')
196
+ for key, val in pattern2.findall(t):
197
+ k = key.strip()
198
+ v = val.strip()
199
+ if v and len(v) <= 200 and not re.fullmatch(r'\d{6,}', v):
200
+ pairs.setdefault(k, v)
201
+
202
+ return pairs
203
+
204
+ def _extract_audit_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
205
+ """Extract audit-specific information with better filtering"""
206
+ audit_info: Dict[str, Any] = {}
207
+
208
+ # Prefer tables
209
+ for table in tables:
210
+ headers = [str(h).lower() for h in table.get("headers", [])]
211
+ joined = ' '.join(headers)
212
+ if "audit information" in joined or "auditinformation" in joined:
213
+ data = table.get("data", [])
214
+ for row in data:
215
+ if len(row) >= 2 and row[0] and row[1]:
216
+ key = str(row[0]).strip()
217
+ value = str(row[1]).strip()
218
+ # Skip numbered list rows (e.g., "1.", "2)")
219
+ if re.match(r'^\s*\d+\s*[.)]\s*$', key):
220
+ continue
221
+ if key and value:
222
+ audit_info[key] = value
223
+
224
+ # Backup from text
225
+ candidates = {
226
+ "Date of Audit": r'Date\s+of\s+Audit[:\s]*([^\n\r]+)',
227
+ "Location of audit": r'Location\s+of\s+audit[:\s]*([^\n\r]+)',
228
+ "Auditor name": r'Auditor\s+name[:\s]*([^\n\r]+)',
229
+ "Audit Matrix Identifier (Name or Number)": r'Audit\s+Matrix\s+Identifier.*?[:\s]*([^\n\r]+)',
230
+ }
231
+ for k, pat in candidates.items():
232
+ if k not in audit_info:
233
+ m = re.search(pat, text, re.IGNORECASE)
234
+ if m:
235
+ audit_info[k] = m.group(1).strip()
236
+
237
+ return audit_info
238
+
239
+ def _extract_operator_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
240
+ """Extract operator information with better table parsing"""
241
+ operator_info: Dict[str, Any] = {}
242
+
243
+ # Look for operator information in tables first
244
+ for table in tables:
245
+ headers = [str(h).lower() for h in table.get("headers", [])]
246
+ if ("operatorinformation" in ' '.join(headers) or
247
+ "operator information" in ' '.join(headers) or
248
+ "operatorcontactdetails" in ' '.join(headers)):
249
+
250
+ data = table.get("data", [])
251
+ for row in data:
252
+ if len(row) >= 2 and row[0] and row[1]:
253
+ key = str(row[0]).strip()
254
+ value = str(row[1]).strip()
255
+ if key and value:
256
+ # Clean up key names
257
+ kl = key.lower()
258
+ if "operator name" in kl:
259
+ operator_info["operator_name"] = value
260
+ elif "trading name" in kl:
261
+ operator_info["trading_name"] = value
262
+ elif "company number" in kl:
263
+ if len(row) > 2:
264
+ company_parts = [str(r).strip() for r in row[1:] if str(r).strip()]
265
+ operator_info["company_number"] = "".join(company_parts)
266
+ else:
267
+ operator_info["company_number"] = value
268
+ elif "business address" in kl:
269
+ operator_info["business_address"] = value
270
+ elif "postal address" in kl:
271
+ operator_info["postal_address"] = value
272
+ elif "email" in kl:
273
+ operator_info["email"] = value
274
+ elif "telephone" in kl or "phone" in kl:
275
+ operator_info["phone"] = value
276
+ elif "nhvas accreditation" in kl:
277
+ operator_info["nhvas_accreditation"] = value
278
+ elif "nhvas manual" in kl:
279
+ operator_info["nhvas_manual"] = value
280
+
281
+ # Extract from text patterns as backup
282
+ patterns = {
283
+ 'operator_name': r'Operator\s*name[:\s\(]*([^\n\r\)]+?)(?=\s*NHVAS|\s*Registered|$)',
284
+ 'trading_name': r'Registered\s*trading\s*name[:\s\/]*([^\n\r]+?)(?=\s*Australian|$)',
285
+ 'company_number': r'Australian\s*Company\s*Number[:\s]*([0-9\s]+?)(?=\s*NHVAS|$)',
286
+ 'business_address': r'Operator\s*business\s*address[:\s]*([^\n\r]+?)(?=\s*Operator\s*Postal|$)',
287
+ 'postal_address': r'Operator\s*Postal\s*address[:\s]*([^\n\r]+?)(?=\s*Email|$)',
288
+ 'email': r'Email\s*address[:\s]*([^\s\n\r]+)',
289
+ 'phone': r'Operator\s*Telephone\s*Number[:\s]*([^\s\n\r]+)',
290
+ 'nhvas_accreditation': r'NHVAS\s*Accreditation\s*No\.[:\s\(]*([^\n\r\)]+)',
291
+ }
292
+
293
+ for key, pattern in patterns.items():
294
+ if key not in operator_info: # Only use text if not found in tables
295
+ match = re.search(pattern, text, re.IGNORECASE)
296
+ if match:
297
+ value = match.group(1).strip()
298
+ if value and len(value) < 200:
299
+ if key == 'company_number':
300
+ value = re.sub(r'\s+', '', value)
301
+ operator_info[key] = value
302
+
303
+ return operator_info
304
+
305
+ def _extract_vehicle_registrations(self, tables: List[Dict]) -> List[Dict]:
306
+ """Extract vehicle registration information from tables"""
307
+ vehicles: List[Dict[str, Any]] = []
308
+
309
+ for table in tables:
310
+ headers = [str(h).lower() for h in table.get("headers", [])]
311
+
312
+ # Look for vehicle registration tables
313
+ if any(keyword in ' '.join(headers) for keyword in ['registration', 'vehicle', 'number']):
314
+ reg_col = None
315
+ for i, header in enumerate(headers):
316
+ if 'registration' in header and 'number' in header:
317
+ reg_col = i
318
+ break
319
+
320
+ if reg_col is not None:
321
+ data = table.get("data", [])
322
+ for row in data:
323
+ if len(row) > reg_col and row[reg_col]:
324
+ reg_num = str(row[reg_col]).strip()
325
+ # Validate registration format (letters/numbers)
326
+ if re.match(r'^[A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3}$', reg_num):
327
+ vehicle_info = {"registration_number": reg_num}
328
+
329
+ # Add other columns as additional info
330
+ for i, header in enumerate(table.get("headers", [])):
331
+ if i < len(row) and i != reg_col:
332
+ vehicle_info[str(header)] = str(row[i]).strip()
333
+
334
+ vehicles.append(vehicle_info)
335
+
336
+ return vehicles
337
+
338
+ def _extract_driver_records(self, tables: List[Dict]) -> List[Dict]:
339
+ """Extract driver records from tables"""
340
+ drivers: List[Dict[str, Any]] = []
341
+
342
+ for table in tables:
343
+ headers = [str(h).lower() for h in table.get("headers", [])]
344
+
345
+ # Look for driver/scheduler tables
346
+ if any(keyword in ' '.join(headers) for keyword in ['driver', 'scheduler', 'name']):
347
+ name_col = None
348
+ for i, header in enumerate(headers):
349
+ if 'name' in header:
350
+ name_col = i
351
+ break
352
+
353
+ if name_col is not None:
354
+ data = table.get("data", [])
355
+ for row in data:
356
+ if len(row) > name_col and row[name_col]:
357
+ name = str(row[name_col]).strip()
358
+ # Basic name validation
359
+ if re.match(r'^[A-Za-z\s]{2,}$', name) and len(name.split()) >= 2:
360
+ driver_info = {"name": name}
361
+
362
+ # Add other columns
363
+ for i, header in enumerate(table.get("headers", [])):
364
+ if i < len(row) and i != name_col:
365
+ driver_info[str(header)] = str(row[i]).strip()
366
+
367
+ drivers.append(driver_info)
368
+
369
+ return drivers
370
+
371
+ def _extract_compliance_summary(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
372
+ """Extract compliance information"""
373
+ compliance = {
374
+ "standards_compliance": {},
375
+ "compliance_codes": {},
376
+ "audit_results": []
377
+ }
378
+
379
+ # Look for compliance tables
380
+ for table in tables:
381
+ headers = [str(h).lower() for h in table.get("headers", [])]
382
+
383
+ if any(keyword in ' '.join(headers) for keyword in ['compliance', 'standard', 'requirement']):
384
+ data = table.get("data", [])
385
+ for row in data:
386
+ if len(row) >= 2:
387
+ standard = str(row[0]).strip()
388
+ code = str(row[1]).strip()
389
+ if standard.startswith('Std') and code in ['V', 'NC', 'SFI', 'NAP', 'NA']:
390
+ compliance["standards_compliance"][standard] = code
391
+
392
+ # Extract compliance codes definitions
393
+ code_patterns = {
394
+ 'V': r'\bV\b\s+([^\n\r]+)',
395
+ 'NC': r'\bNC\b\s+([^\n\r]+)',
396
+ 'SFI': r'\bSFI\b\s+([^\n\r]+)',
397
+ 'NAP': r'\bNAP\b\s+([^\n\r]+)',
398
+ 'NA': r'\bNA\b\s+([^\n\r]+)',
399
+ }
400
+
401
+ for code, pattern in code_patterns.items():
402
+ match = re.search(pattern, text, re.IGNORECASE)
403
+ if match:
404
+ compliance["compliance_codes"][code] = match.group(1).strip()
405
+
406
+ return compliance
407
+
408
+ def _extract_dates_and_numbers_improved(self, text: str) -> Dict[str, Any]:
409
+ """Improved date and number extraction"""
410
+ result = {
411
+ "dates": [],
412
+ "registration_numbers": [],
413
+ "phone_numbers": [],
414
+ "email_addresses": [],
415
+ "reference_numbers": []
416
+ }
417
+
418
+ # Date patterns
419
+ date_patterns = [
420
+ r'\b(\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})\b',
421
+ r'\b(\d{1,2}/\d{1,2}/\d{4})\b',
422
+ r'\b(\d{1,2}-\d{1,2}-\d{4})\b',
423
+ r'\b(\d{1,2}\.\d{1,2}\.\d{4})\b',
424
+ ]
425
+ for pattern in date_patterns:
426
+ result["dates"].extend(re.findall(pattern, text))
427
+
428
+ # Registration numbers (Australian format-ish)
429
+ reg_pattern = r'\b([A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3})\b'
430
+ result["registration_numbers"] = list(set(re.findall(reg_pattern, text)))
431
+
432
+ # Phone numbers (AU)
433
+ phone_pattern = r'\b((?:\+61|0)[2-9]\s?\d{4}\s?\d{4})\b'
434
+ result["phone_numbers"] = list(set(re.findall(phone_pattern, text)))
435
+
436
+ # Email addresses
437
+ email_pattern = r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b'
438
+ result["email_addresses"] = list(set(re.findall(email_pattern, text)))
439
+
440
+ # Reference numbers
441
+ ref_patterns = [
442
+ (r'RF(?:S)?\s*#?\s*(\d+)', 'RFS_Certifications'),
443
+ (r'NHVAS\s+Accreditation\s+No\.?\s*(\d+)', 'NHVAS_Numbers'),
444
+ (r'Registration\s+Number\s*#?\s*(\d+)', 'Registration_Numbers'),
445
+ ]
446
+ for pattern, key in ref_patterns:
447
+ matches = re.findall(pattern, text, re.IGNORECASE)
448
+ if matches:
449
+ result["reference_numbers"].extend([f"{key}: {m}" for m in matches])
450
+
451
+ return result
452
+
453
+ @staticmethod
454
+ def save_results(results: Dict[str, Any], output_path: str):
455
+ """Save results to JSON file"""
456
+ try:
457
+ with open(output_path, 'w', encoding='utf-8') as f:
458
+ json.dump(results, f, indent=2, ensure_ascii=False)
459
+ logger.info(f"💾 Results saved to {output_path}")
460
+ except Exception as e:
461
+ logger.error(f"Failed to save results: {e}")
462
+
463
+ @staticmethod
464
+ def export_to_excel(results: Dict[str, Any], excel_path: str):
465
+ """Export results to Excel with improved formatting"""
466
+ try:
467
+ with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
468
+ # Summary sheet
469
+ summary_data = []
470
+ extraction_summary = results.get("extraction_summary", {})
471
+ for key, value in extraction_summary.items():
472
+ summary_data.append({"Metric": key.replace("_", " ").title(), "Value": value})
473
+ pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
474
+
475
+ # Key-value pairs
476
+ kv_pairs = results.get("extracted_data", {}).get("key_value_pairs", {})
477
+ if kv_pairs:
478
+ kv_df = pd.DataFrame(list(kv_pairs.items()), columns=['Key', 'Value'])
479
+ kv_df.to_excel(writer, sheet_name='Key_Value_Pairs', index=False)
480
+
481
+ # Vehicle registrations
482
+ vehicles = results.get("extracted_data", {}).get("vehicle_registrations", [])
483
+ if vehicles:
484
+ pd.DataFrame(vehicles).to_excel(writer, sheet_name='Vehicle_Registrations', index=False)
485
+
486
+ # Driver records
487
+ drivers = results.get("extracted_data", {}).get("driver_records", [])
488
+ if drivers:
489
+ pd.DataFrame(drivers).to_excel(writer, sheet_name='Driver_Records', index=False)
490
+
491
+ # Compliance summary
492
+ compliance = results.get("extracted_data", {}).get("compliance_summary", {})
493
+ if compliance.get("standards_compliance"):
494
+ comp_df = pd.DataFrame(list(compliance["standards_compliance"].items()),
495
+ columns=['Standard', 'Compliance_Code'])
496
+ comp_df.to_excel(writer, sheet_name='Compliance_Standards', index=False)
497
+
498
+ logger.info(f"📊 Results exported to Excel: {excel_path}")
499
+ except Exception as e:
500
+ logger.error(f"Failed to export to Excel: {e}")
501
+
502
+ def main():
503
+ if len(sys.argv) < 2:
504
+ print("Usage: python fixed_pdf_extractor.py <pdf_path>")
505
+ sys.exit(1)
506
+
507
+ pdf_path = Path(sys.argv[1])
508
+ if not pdf_path.exists():
509
+ print(f"❌ PDF not found: {pdf_path}")
510
+ sys.exit(1)
511
+
512
+ print("🚀 Fixed PDF Data Extractor")
513
+ print("=" * 50)
514
+
515
+ extractor = FixedPDFExtractor()
516
+ results = extractor.extract_everything(str(pdf_path))
517
+
518
+ base = pdf_path.stem
519
+ output_dir = pdf_path.parent
520
+
521
+ # Save outputs
522
+ json_path = output_dir / f"{base}_comprehensive_data.json"
523
+ excel_path = output_dir / f"{base}_fixed_extraction.xlsx"
524
+
525
+ FixedPDFExtractor.save_results(results, str(json_path))
526
+ FixedPDFExtractor.export_to_excel(results, str(excel_path))
527
+
528
+ print("\n💾 OUTPUT FILES:")
529
+ print(f" 📄 JSON Data: {json_path}")
530
+ print(f" 📊 Excel Data: {excel_path}")
531
+ print(f"\n✨ FIXED EXTRACTION COMPLETE!")
532
+
533
+ if __name__ == "__main__":
534
+ main()
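
For reference, a minimal sketch of driving the extractor from Python instead of the CLI entry point above; it assumes the file is importable as `extract_pdf_data` and uses a hypothetical `sample.pdf` path.

```python
from pathlib import Path

from extract_pdf_data import FixedPDFExtractor  # assumed import name for this module

pdf_path = Path("sample.pdf")  # hypothetical input PDF

extractor = FixedPDFExtractor()
results = extractor.extract_everything(str(pdf_path))

# Write the same two artifacts main() produces, next to the source PDF.
FixedPDFExtractor.save_results(results, str(pdf_path.with_name(f"{pdf_path.stem}_comprehensive_data.json")))
FixedPDFExtractor.export_to_excel(results, str(pdf_path.with_name(f"{pdf_path.stem}_fixed_extraction.xlsx")))

# The nested result layout mirrors what export_to_excel() reads back out.
vehicles = results.get("extracted_data", {}).get("vehicle_registrations", [])
print(f"{len(vehicles)} vehicle registration rows extracted")
```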
space-pdf/extract_red_text.py ADDED
@@ -0,0 +1,764 @@
1
+ #!/usr/bin/env python3
2
+ import re
3
+ import json
4
+ import sys
5
+ from docx import Document
6
+ from docx.oxml.ns import qn
7
+ from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
+
9
+ def normalize_header_label(s: str) -> str:
10
+ """Normalize a header/label by stripping parentheticals & punctuation."""
11
+ s = re.sub(r"\s+", " ", s.strip())
12
+ # remove content in parentheses/brackets
13
+ s = re.sub(r"\([^)]*\)", "", s)
14
+ s = re.sub(r"\[[^]]*\]", "", s)
15
+ # unify slashes and hyphens, collapse spaces
16
+ s = re.sub(r"\s+", " ", s.replace("–", "-").replace("—", "-").replace("/", " / "))
17
+ return s.strip()
18
+
19
+ # Canonical label aliases for Vehicle/Maintenance/General headers
20
+ LABEL_ALIASES = {
21
+ # Vehicle Registration (Maintenance)
22
+ "roadworthiness certificates": "Roadworthiness Certificates",
23
+ "maintenance records": "Maintenance Records",
24
+ "daily checks": "Daily Checks",
25
+ "fault recording / reporting": "Fault Recording/ Reporting",
26
+ "fault repair": "Fault Repair",
27
+
28
+ # Vehicle Registration (Mass)
29
+ "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance",
30
+ "weight verification records": "Weight Verification Records",
31
+ "rfs suspension certification #": "RFS Suspension Certification #",
32
+ "suspension system maintenance": "Suspension System Maintenance",
33
+ "trip records": "Trip Records",
34
+ "fault recording/ reporting on suspension system": "Fault Recording/ Reporting on Suspension System",
35
+
36
+ # Common
37
+ "registration number": "Registration Number",
38
+ "no.": "No.",
39
+ "sub contractor": "Sub contractor",
40
+ "sub-contractor": "Sub contractor",
41
+ }
42
+
43
+ def looks_like_operator_declaration(context):
44
+ """True iff heading says Operator Declaration and headers include Print Name + Position Title."""
45
+ heading = (context.get("heading") or "").strip().lower()
46
+ headers = " ".join(context.get("headers") or []).lower()
47
+ return (
48
+ "operator declaration" in heading
49
+ and "print name" in headers
50
+ and "position" in headers
51
+ and "title" in headers
52
+ )
53
+
54
+ def looks_like_auditor_declaration(context):
55
+ heading = (context.get("heading") or "").strip().lower()
56
+ headers = " ".join(context.get("headers") or []).lower()
57
+ return (
58
+ "auditor declaration" in heading
59
+ and "print name" in headers
60
+ and ("nhvr" in headers or "auditor registration number" in headers)
61
+ )
62
+
63
+ # --- NEW: header-only fallback that ignores headings and just keys on the two column names
64
+ def extract_operator_declaration_by_headers_from_end(doc):
65
+ """
66
+ Scan tables from the end; if a table's first row contains both
67
+ 'Print Name' AND 'Position Title' (case-insensitive), extract red text
68
+ from the data rows into:
69
+ {"Print Name": [...], "Position Title": [...]}
70
+ """
71
+ for tbl in reversed(doc.tables):
72
+ if len(tbl.rows) < 2:
73
+ continue # need header + at least one data row
74
+
75
+ headers_norm = [normalize_header_label(c.text).lower() for c in tbl.rows[0].cells]
76
+ has_print = any("print name" in h for h in headers_norm)
77
+ has_pos_tit = any(("position title" in h) or ("position" in h and "title" in h) for h in headers_norm)
78
+ if not (has_print and has_pos_tit):
79
+ continue
80
+
81
+ idx_print = next((i for i, h in enumerate(headers_norm) if "print name" in h), None)
82
+ idx_pos = next((i for i, h in enumerate(headers_norm) if "position title" in h), None)
83
+ if idx_pos is None:
84
+ idx_pos = next((i for i, h in enumerate(headers_norm) if ("position" in h and "title" in h)), None)
85
+
86
+ result = {"Print Name": [], "Position Title": []}
87
+ for row in tbl.rows[1:]:
88
+ if idx_print is not None and idx_print < len(row.cells):
89
+ cell = row.cells[idx_print]
90
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
91
+ reds = coalesce_numeric_runs(reds)
92
+ txt = normalize_text(" ".join(reds))
93
+ if txt:
94
+ result["Print Name"].append(txt)
95
+
96
+ if idx_pos is not None and idx_pos < len(row.cells):
97
+ cell = row.cells[idx_pos]
98
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
99
+ reds = coalesce_numeric_runs(reds)
100
+ txt = normalize_text(" ".join(reds))
101
+ if txt:
102
+ result["Position Title"].append(txt)
103
+
104
+ if result["Print Name"] or result["Position Title"]:
105
+ return {k: v for k, v in result.items() if v}
106
+
107
+ return None
108
+ # --- end NEW helper
109
+
110
+ def canonicalize_label(s: str) -> str:
111
+ key = normalize_header_label(s).lower()
112
+ key = re.sub(r"\s+", " ", key)
113
+ return LABEL_ALIASES.get(key, s)
114
+
115
+ def bag_similarity(a: str, b: str) -> float:
116
+ """Loose bag-of-words similarity for header↔label matching."""
117
+ aw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(a).lower()) if len(w) > 2 or w in {"#","no"}}
118
+ bw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(b).lower()) if len(w) > 2 or w in {"#","no"}}
119
+ if not aw or not bw:
120
+ return 0.0
121
+ inter = len(aw & bw)
122
+ return inter / max(len(aw), len(bw))
123
+
124
+ def coalesce_numeric_runs(text_list):
125
+ """
126
+ If a cell yields ['4','5','6','9','8','7','1','2','3'] etc., join continuous single-char digit runs.
127
+ Returns ['456987123'] instead of many singles. Non-digit tokens are preserved.
128
+ """
129
+ out, buf = [], []
130
+ for t in text_list:
131
+ if len(t) == 1 and t.isdigit():
132
+ buf.append(t)
133
+ else:
134
+ if buf:
135
+ out.append("".join(buf))
136
+ buf = []
137
+ out.append(t)
138
+ if buf:
139
+ out.append("".join(buf))
140
+ return out
141
+
142
+ def is_red_font(run):
143
+ """Enhanced red font detection with better color checking"""
144
+ col = run.font.color
145
+ if col and col.rgb:
146
+ r, g, b = col.rgb
147
+ if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
148
+ return True
149
+ rPr = getattr(run._element, "rPr", None)
150
+ if rPr is not None:
151
+ clr = rPr.find(qn('w:color'))
152
+ if clr is not None:
153
+ val = clr.get(qn('w:val'))
154
+ if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
155
+ rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
156
+ if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
157
+ return True
158
+ return False
159
+
160
+ def _prev_para_text(tbl):
161
+ """Get text from previous paragraph before table"""
162
+ prev = tbl._tbl.getprevious()
163
+ while prev is not None and not prev.tag.endswith("}p"):
164
+ prev = prev.getprevious()
165
+ if prev is None:
166
+ return ""
167
+ return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
168
+
169
+ def normalize_text(text):
170
+ """Normalize text for better matching"""
171
+ return re.sub(r'\s+', ' ', text.strip())
172
+
173
+ def fuzzy_match_heading(heading, patterns):
174
+ """Check if heading matches any pattern with fuzzy matching"""
175
+ heading_norm = normalize_text(heading.upper())
176
+ for pattern in patterns:
177
+ if re.search(pattern, heading_norm, re.IGNORECASE):
178
+ return True
179
+ return False
180
+
181
+ def get_table_context(tbl):
182
+ """Get comprehensive context information for table"""
183
+ heading = normalize_text(_prev_para_text(tbl))
184
+ headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()]
185
+ col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells[0].text.strip()]
186
+ first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
187
+ all_cells = []
188
+ for row in tbl.rows:
189
+ for cell in row.cells:
190
+ text = normalize_text(cell.text)
191
+ if text:
192
+ all_cells.append(text)
193
+ return {
194
+ 'heading': heading,
195
+ 'headers': headers,
196
+ 'col0': col0,
197
+ 'first_cell': first_cell,
198
+ 'all_cells': all_cells,
199
+ 'num_rows': len(tbl.rows),
200
+ 'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
201
+ }
202
+
203
+ def calculate_schema_match_score(schema_name, spec, context):
204
+ """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
205
+ score = 0
206
+ reasons = []
207
+
208
+ # 🎯 VEHICLE REGISTRATION BOOST
209
+ if "Vehicle Registration" in schema_name:
210
+ vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
211
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
212
+ keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
213
+ if keyword_matches >= 2:
214
+ score += 150 # Very high boost for vehicle tables
215
+ reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
216
+ elif keyword_matches >= 1:
217
+ score += 75 # Medium boost
218
+ reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
219
+
220
+ # 🎯 SUMMARY TABLE BOOST (existing logic)
221
+ if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
222
+ score += 100
223
+ reasons.append(f"Summary schema with DETAILS column - perfect match")
224
+
225
+ if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
226
+ score -= 75
227
+ reasons.append(f"Non-summary schema penalized for DETAILS column presence")
228
+
229
+ # Context exclusions
230
+ if spec.get("context_exclusions"):
231
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
232
+ for exclusion in spec["context_exclusions"]:
233
+ if exclusion.lower() in table_text:
234
+ score -= 50
235
+ reasons.append(f"Context exclusion penalty: '{exclusion}' found")
236
+
237
+ # Context keywords
238
+ if spec.get("context_keywords"):
239
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
240
+ keyword_matches = 0
241
+ for keyword in spec["context_keywords"]:
242
+ if keyword.lower() in table_text:
243
+ keyword_matches += 1
244
+
245
+ if keyword_matches > 0:
246
+ score += keyword_matches * 15
247
+ reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
248
+
249
+ # Direct first cell match
250
+ if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
251
+ score += 100
252
+ reasons.append(f"Direct first cell match: '{context['first_cell']}'")
253
+
254
+ # Heading pattern matching
255
+ if spec.get("headings"):
256
+ for h in spec["headings"]:
257
+ if fuzzy_match_heading(context['heading'], [h["text"]]):
258
+ score += 50
259
+ reasons.append(f"Heading match: '{context['heading']}'")
260
+ break
261
+
262
+ # Column header matching
263
+ if spec.get("columns"):
264
+ cols = [normalize_text(col) for col in spec["columns"]]
265
+ matches = 0
266
+ for col in cols:
267
+ if any(col.upper() in h.upper() for h in context['headers']):
268
+ matches += 1
269
+ if matches == len(cols):
270
+ score += 60
271
+ reasons.append(f"All column headers match: {cols}")
272
+ elif matches > 0:
273
+ score += matches * 20
274
+ reasons.append(f"Partial column matches: {matches}/{len(cols)}")
275
+
276
+ # Label matching for left-oriented tables
277
+ if spec.get("orientation") == "left":
278
+ labels = [normalize_text(lbl) for lbl in spec["labels"]]
279
+ matches = 0
280
+ for lbl in labels:
281
+ if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context['col0']):
282
+ matches += 1
283
+ if matches > 0:
284
+ score += (matches / len(labels)) * 30
285
+ reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
286
+
287
+ # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
288
+ elif spec.get("orientation") == "row1":
289
+ labels = [normalize_text(lbl) for lbl in spec["labels"]]
290
+ matches = 0
291
+ for lbl in labels:
292
+ if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
293
+ matches += 1
294
+ elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
295
+ matches += 0.5 # Partial credit
296
+ if matches > 0:
297
+ score += (matches / len(labels)) * 40
298
+ reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
299
+
300
+ # Special handling for Declaration tables (existing logic)
301
+ if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
302
+ if "OPERATOR DECLARATION" in context['heading'].upper():
303
+ score += 80
304
+ reasons.append("Operator Declaration context match")
305
+ elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
306
+ score += 60
307
+ reasons.append("Manager found in cells (likely Operator Declaration)")
308
+
309
+ if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
310
+ if any("MANAGER" in cell.upper() for cell in context['all_cells']):
311
+ score -= 50
312
+ reasons.append("Penalty: Manager found (not auditor)")
313
+
314
+ return score, reasons
315
+
316
+ def match_table_schema(tbl):
317
+ """Improved table schema matching with scoring system"""
318
+ context = get_table_context(tbl)
319
+ # Auditor Declaration first
320
+ if ("print name" in " ".join(context.get("headers", [])).lower() and
321
+ "auditor" in " ".join(context.get("headers", [])).lower()):
322
+ return "NHVAS Approved Auditor Declaration"
323
+ # NEW: prioritize Auditor Declaration to avoid misclassification
324
+ if looks_like_auditor_declaration(context):
325
+ return "NHVAS Approved Auditor Declaration"
326
+ # hard-match Operator Declaration first (high priority, avoids misclassification)
327
+ if looks_like_operator_declaration(context):
328
+ return "Operator Declaration"
329
+ best_match = None
330
+ best_score = 0
331
+ for name, spec in TABLE_SCHEMAS.items():
332
+ score, reasons = calculate_schema_match_score(name, spec, context)
333
+ if score > best_score:
334
+ best_score = score
335
+ best_match = name
336
+ if best_score >= 20:
337
+ return best_match
338
+ return None
339
+
340
+ def check_multi_schema_table(tbl):
341
+ """Check if table contains multiple schemas and split appropriately"""
342
+ context = get_table_context(tbl)
343
+ operator_labels = ["Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
344
+ "Australian Company Number", "NHVAS Manual"]
345
+ contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]
346
+ has_operator = any(any(op_lbl.upper() in cell.upper() for op_lbl in operator_labels) for cell in context['col0'])
347
+ has_contact = any(any(cont_lbl.upper() in cell.upper() for cont_lbl in contact_labels) for cell in context['col0'])
348
+ if has_operator and has_contact:
349
+ return ["Operator Information", "Operator contact details"]
350
+ return None
351
+
352
+ def extract_multi_schema_table(tbl, schemas):
353
+ """Extract data from table with multiple schemas"""
354
+ result = {}
355
+ for schema_name in schemas:
356
+ if schema_name not in TABLE_SCHEMAS:
357
+ continue
358
+ spec = TABLE_SCHEMAS[schema_name]
359
+ schema_data = {}
360
+ for ri, row in enumerate(tbl.rows):
361
+ if ri == 0:
362
+ continue
363
+ row_label = normalize_text(row.cells[0].text)
364
+ belongs_to_schema = False
365
+ matched_label = None
366
+ for spec_label in spec["labels"]:
367
+ spec_norm = normalize_text(spec_label).upper()
368
+ row_norm = row_label.upper()
369
+ if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
370
+ belongs_to_schema = True
371
+ matched_label = spec_label
372
+ break
373
+ if not belongs_to_schema:
374
+ continue
375
+ for ci, cell in enumerate(row.cells):
376
+ red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
377
+ if red_txt:
378
+ if matched_label not in schema_data:
379
+ schema_data[matched_label] = []
380
+ if red_txt not in schema_data[matched_label]:
381
+ schema_data[matched_label].append(red_txt)
382
+ if schema_data:
383
+ result[schema_name] = schema_data
384
+ return result
385
+
386
+ def extract_table_data(tbl, schema_name, spec):
387
+ """Extract red text data from table based on schema – per-row repeats for specific tables."""
388
+
389
+ # ───────────────────────────────────────────────────────────────────────────
390
+ # OPERATOR DECLARATION (row1 headers: Print Name | Position Title)
391
+ # ───────────────────────────────────────────────────────────────────────────
392
+ if schema_name == "Operator Declaration":
393
+ print(f" 🧾 EXTRACTION FIX: Processing Operator Declaration table")
394
+
395
+ labels = spec["labels"] # ["Print Name", "Position Title"]
396
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
397
+
398
+ collected = {lbl: [] for lbl in labels}
399
+
400
+ if len(tbl.rows) < 2:
401
+ print(f" ❌ Operator Declaration table has less than 2 rows")
402
+ return {}
403
+
404
+ # map header cells → labels (row1 orientation)
405
+ header_row = tbl.rows[0]
406
+ column_mapping = {}
407
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
408
+
409
+ for col_idx, cell in enumerate(header_row.cells):
410
+ raw_h = normalize_text(cell.text)
411
+ header_text = normalize_header_label(raw_h)
412
+ if not header_text:
413
+ continue
414
+ print(f" Column {col_idx}: '{raw_h}'")
415
+
416
+ # alias/canonical first
417
+ canon = canonicalize_label(header_text)
418
+ if canon in canonical_labels:
419
+ best_label = canonical_labels[canon]
420
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
421
+ column_mapping[col_idx] = best_label
422
+ continue
423
+
424
+ # else bag-of-words similarity
425
+ best_label, best_score = None, 0.0
426
+ for canon_lab, original_lab in canonical_labels.items():
427
+ s = bag_similarity(header_text, canon_lab)
428
+ if s > best_score:
429
+ best_score, best_label = s, original_lab
430
+
431
+ if best_label and best_score >= 0.40:
432
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
433
+ column_mapping[col_idx] = best_label
434
+ else:
435
+ print(f" ⚠️ No mapping found for '{raw_h}'")
436
+
437
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
438
+
439
+ # collect red text from the (usually single) data row
440
+ for row_idx in range(1, len(tbl.rows)):
441
+ row = tbl.rows[row_idx]
442
+ print(f" 📌 Processing data row {row_idx}")
443
+ for col_idx, cell in enumerate(row.cells):
444
+ if col_idx not in column_mapping:
445
+ continue
446
+ label = column_mapping[col_idx]
447
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
448
+ if not reds:
449
+ continue
450
+ reds = coalesce_numeric_runs(reds)
451
+ red_txt = normalize_text(" ".join(reds))
452
+ if not red_txt:
453
+ continue
454
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
455
+ collected[label].append(red_txt)
456
+
457
+ result = {k: v for k, v in collected.items() if v}
458
+ print(f" ✅ Operator Declaration extracted: {len(result)} columns with data")
459
+ return result
460
+
461
+ # ───────────────────────────────────────────────────────────────────────────
462
+ # A) Vehicle Registration tables (per-row accumulation; NO dedupe)
463
+ # ───────────────────────────────────────────────────────────────────────────
464
+ if "Vehicle Registration" in schema_name:
465
+ print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
466
+
467
+ labels = spec["labels"]
468
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
469
+
470
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
471
+ unmapped_bucket = {}
472
+
473
+ if len(tbl.rows) < 2:
474
+ print(f" ❌ Vehicle table has less than 2 rows")
475
+ return {}
476
+
477
+ header_row = tbl.rows[0]
478
+ column_mapping = {}
479
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
480
+
481
+ for col_idx, cell in enumerate(header_row.cells):
482
+ raw_h = normalize_text(cell.text)
483
+ header_text = normalize_header_label(raw_h)
484
+ if not header_text:
485
+ continue
486
+ print(f" Column {col_idx}: '{raw_h}'")
487
+
488
+ # Try alias/canonical first
489
+ canon = canonicalize_label(header_text)
490
+ if canon in canonical_labels:
491
+ best_label = canonical_labels[canon]
492
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
493
+ column_mapping[col_idx] = best_label
494
+ continue
495
+
496
+ # Else bag-of-words similarity
497
+ best_label, best_score = None, 0.0
498
+ for canon_lab, original_lab in canonical_labels.items():
499
+ s = bag_similarity(header_text, canon_lab)
500
+ if s > best_score:
501
+ best_score, best_label = s, original_lab
502
+
503
+ if best_label and best_score >= 0.40:
504
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
505
+ column_mapping[col_idx] = best_label
506
+ else:
507
+ print(f" ⚠️ No mapping found for '{raw_h}'")
508
+ unmapped_bucket[raw_h] = []
509
+
510
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
511
+
512
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
513
+ for row_idx in range(1, len(tbl.rows)):
514
+ row = tbl.rows[row_idx]
515
+ print(f" 📌 Processing data row {row_idx}")
516
+ for col_idx, cell in enumerate(row.cells):
517
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
518
+ if not reds:
519
+ continue
520
+ reds = coalesce_numeric_runs(reds)
521
+ red_txt = normalize_text(" ".join(reds))
522
+ if not red_txt:
523
+ continue
524
+
525
+ if col_idx in column_mapping:
526
+ label = column_mapping[col_idx]
527
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
528
+ collected[label].append(red_txt) # ← append every occurrence
529
+ else:
530
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
531
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
532
+
533
+ result = {k: v for k, v in collected.items() if v}
534
+ if unmapped_bucket:
535
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
536
+ print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
537
+ return result
538
+
539
+ # ───────────────────────────────────────────────────────────────────────────
540
+ # B) Driver / Scheduler Records Examined (per-row accumulation; NO dedupe)
541
+ # ───────────────────────────────────────────────────────────────────────────
542
+ if "Driver / Scheduler" in schema_name:
543
+ print(f" 👤 EXTRACTION FIX: Processing Driver / Scheduler table")
544
+
545
+ labels = spec["labels"]
546
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
547
+
548
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
549
+ unmapped_bucket = {}
550
+
551
+ if len(tbl.rows) < 2:
552
+ print(f" ❌ Driver/Scheduler table has less than 2 rows")
553
+ return {}
554
+
555
+ header_row = tbl.rows[0]
556
+ column_mapping = {}
557
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
558
+
559
+ for col_idx, cell in enumerate(header_row.cells):
560
+ raw_h = normalize_text(cell.text)
561
+ header_text = normalize_header_label(raw_h)
562
+ if not header_text:
563
+ continue
564
+ print(f" Column {col_idx}: '{raw_h}'")
565
+
566
+ # Try alias/canonical first (rarely used here, but safe)
567
+ canon = canonicalize_label(header_text)
568
+ if canon in canonical_labels:
569
+ best_label = canonical_labels[canon]
570
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
571
+ column_mapping[col_idx] = best_label
572
+ continue
573
+
574
+ # Else bag-of-words similarity (good for long headings)
575
+ best_label, best_score = None, 0.0
576
+ for canon_lab, original_lab in canonical_labels.items():
577
+ s = bag_similarity(header_text, canon_lab)
578
+ if s > best_score:
579
+ best_score, best_label = s, original_lab
580
+
581
+ if best_label and best_score >= 0.40:
582
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
583
+ column_mapping[col_idx] = best_label
584
+ else:
585
+ print(f" ⚠️ No mapping found for '{raw_h}'")
586
+ unmapped_bucket[raw_h] = []
587
+
588
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
589
+
590
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
591
+ for row_idx in range(1, len(tbl.rows)):
592
+ row = tbl.rows[row_idx]
593
+ print(f" 📌 Processing data row {row_idx}")
594
+ for col_idx, cell in enumerate(row.cells):
595
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
596
+ if not reds:
597
+ continue
598
+ reds = coalesce_numeric_runs(reds)
599
+ red_txt = normalize_text(" ".join(reds))
600
+ if not red_txt:
601
+ continue
602
+
603
+ if col_idx in column_mapping:
604
+ label = column_mapping[col_idx]
605
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
606
+ collected[label].append(red_txt) # ← append every occurrence
607
+ else:
608
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
609
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
610
+
611
+ result = {k: v for k, v in collected.items() if v}
612
+ if unmapped_bucket:
613
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
614
+ print(f" ✅ Driver / Scheduler extracted: {len(result)} columns with data")
615
+ return result
616
+
617
+ # ───────────────────────────────────────────────────────────────────────────
618
+ # C) Generic tables (unchanged: WITH dedupe)
619
+ # ───────────────────────────────────────────────────────────────────────────
620
+ labels = spec["labels"] + [schema_name]
621
+ collected = {lbl: [] for lbl in labels}
622
+ seen = {lbl: set() for lbl in labels}
623
+ by_col = (spec.get("orientation") == "row1")
624
+ start_row = 1 if by_col else 0
625
+ rows = tbl.rows[start_row:]
626
+
627
+ for ri, row in enumerate(rows):
628
+ for ci, cell in enumerate(row.cells):
629
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
630
+ if not reds:
631
+ continue
632
+ reds = coalesce_numeric_runs(reds)
633
+ red_txt = normalize_text(" ".join(reds))
634
+ if not red_txt:
635
+ continue
636
+
637
+ if by_col:
638
+ if ci < len(spec["labels"]):
639
+ lbl = spec["labels"][ci]
640
+ else:
641
+ lbl = schema_name
642
+ else:
643
+ raw_label = normalize_text(row.cells[0].text)
644
+ lbl = None
645
+ for spec_label in spec["labels"]:
646
+ if normalize_text(spec_label).upper() == raw_label.upper():
647
+ lbl = spec_label
648
+ break
649
+ if not lbl:
650
+ a_raw = normalize_header_label(raw_label).upper()
651
+ for spec_label in spec["labels"]:
652
+ a_spec = normalize_header_label(spec_label).upper()
653
+ if a_spec in a_raw or a_raw in a_spec:
654
+ lbl = spec_label
655
+ break
656
+ if not lbl:
657
+ lbl = schema_name
658
+
659
+ if red_txt not in seen[lbl]:
660
+ seen[lbl].add(red_txt)
661
+ collected[lbl].append(red_txt)
662
+
663
+ return {k: v for k, v in collected.items() if v}
664
+
665
+ def extract_red_text(input_doc):
666
+ # input_doc: docx.Document object or file path
667
+ if isinstance(input_doc, str):
668
+ doc = Document(input_doc)
669
+ else:
670
+ doc = input_doc
671
+ out = {}
672
+ table_count = 0
673
+ for tbl in doc.tables:
674
+ table_count += 1
675
+ multi_schemas = check_multi_schema_table(tbl)
676
+ if multi_schemas:
677
+ multi_data = extract_multi_schema_table(tbl, multi_schemas)
678
+ for schema_name, schema_data in multi_data.items():
679
+ if schema_data:
680
+ if schema_name in out:
681
+ for k, v in schema_data.items():
682
+ if k in out[schema_name]:
683
+ out[schema_name][k].extend(v)
684
+ else:
685
+ out[schema_name][k] = v
686
+ else:
687
+ out[schema_name] = schema_data
688
+ continue
689
+ schema = match_table_schema(tbl)
690
+ if not schema:
691
+ continue
692
+ spec = TABLE_SCHEMAS[schema]
693
+ data = extract_table_data(tbl, schema, spec)
694
+ if data:
695
+ if schema in out:
696
+ for k, v in data.items():
697
+ if k in out[schema]:
698
+ out[schema][k].extend(v)
699
+ else:
700
+ out[schema][k] = v
701
+ else:
702
+ out[schema] = data
703
+
704
+ # paragraphs (FIX: do not return early; build full 'paras' then attach)
705
+ paras = {}
706
+ for idx, para in enumerate(doc.paragraphs):
707
+ red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
708
+ if not red_txt:
709
+ continue
710
+ context = None
711
+ for j in range(idx-1, -1, -1):
712
+ txt = normalize_text(doc.paragraphs[j].text)
713
+ if txt:
714
+ all_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
715
+ if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
716
+ context = txt
717
+ break
718
+ if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
719
+ context = "Date"
720
+ if not context:
721
+ context = "(para)"
722
+ paras.setdefault(context, []).append(red_txt)
723
+
724
+ if paras:
725
+ out["paragraphs"] = paras
726
+
727
+ # Fallback: ensure we capture the last-page Operator Declaration by headers
728
+ if "Operator Declaration" not in out:
729
+ op_dec = extract_operator_declaration_by_headers_from_end(doc)
730
+ if op_dec:
731
+ out["Operator Declaration"] = op_dec
732
+
733
+ return out
734
+
735
+ def extract_red_text_filelike(input_file, output_file):
736
+ """
737
+ Accepts:
738
+ input_file: file-like object (BytesIO/File) or path
739
+ output_file: file-like object (opened for writing text) or path
740
+ """
741
+ if hasattr(input_file, "seek"):
742
+ input_file.seek(0)
743
+ doc = Document(input_file)
744
+ result = extract_red_text(doc)
745
+ if hasattr(output_file, "write"):
746
+ json.dump(result, output_file, indent=2, ensure_ascii=False)
747
+ output_file.flush()
748
+ else:
749
+ with open(output_file, "w", encoding="utf-8") as f:
750
+ json.dump(result, f, indent=2, ensure_ascii=False)
751
+ return result
752
+
753
+ if __name__ == "__main__":
754
+ # Support both script and app/file-like usage
755
+ if len(sys.argv) == 3:
756
+ input_docx = sys.argv[1]
757
+ output_json = sys.argv[2]
758
+ doc = Document(input_docx)
759
+ word_data = extract_red_text(doc)
760
+ with open(output_json, 'w', encoding='utf-8') as f:
761
+ json.dump(word_data, f, indent=2, ensure_ascii=False)
762
+ print(json.dumps(word_data, indent=2, ensure_ascii=False))
763
+ else:
764
+ print("To use as a module: extract_red_text_filelike(input_file, output_file)")
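
A short usage sketch for the two entry points defined above, assuming the file is importable as `extract_red_text` and that `audit.docx` is a placeholder document:

```python
import io
import json

from extract_red_text import extract_red_text, extract_red_text_filelike  # assumed import name

# 1) Path (or docx.Document) input, dict output
data = extract_red_text("audit.docx")  # hypothetical .docx containing red-font entries
print(json.dumps(data, indent=2, ensure_ascii=False))

# 2) File-like input/output, e.g. an in-memory upload
with open("audit.docx", "rb") as fh:
    buf = io.BytesIO(fh.read())
out = io.StringIO()
result = extract_red_text_filelike(buf, out)  # writes JSON to `out` and also returns the dict
```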
space-pdf/master_key.py ADDED
@@ -0,0 +1,372 @@
1
+ """
2
+ Improved Master Key for NHVAS Audit extraction:
3
+ - TABLE_SCHEMAS: Enhanced definitions with better matching criteria for Summary vs Basic tables
4
+ - HEADING_PATTERNS: Improved regex patterns for main/sub headings
5
+ - PARAGRAPH_PATTERNS: Enhanced patterns for key narrative sections
6
+ """
7
+
8
+ # 1. Enhanced table schemas with better matching logic
9
+ TABLE_SCHEMAS = {
10
+ "Tick as appropriate": {
11
+ "headings": [
12
+ {"level": 1, "text": "NHVAS Audit Summary Report"},
13
+ ],
14
+ "orientation": "left",
15
+ "labels": [
16
+ "Mass",
17
+ "Entry Audit",
18
+ "Maintenance",
19
+ "Initial Compliance Audit",
20
+ "Basic Fatigue",
21
+ "Compliance Audit",
22
+ "Advanced Fatigue",
23
+ "Spot Check",
24
+ "Triggered Audit"
25
+ ],
26
+ "priority": 90 # High priority for direct match
27
+ },
28
+ "Audit Information": {
29
+ "orientation": "left",
30
+ "labels": [
31
+ "Date of Audit",
32
+ "Location of audit",
33
+ "Auditor name",
34
+ "Audit Matrix Identifier (Name or Number)",
35
+ "Auditor Exemplar Global Reg No.",
36
+ "expiry Date:",
37
+ "NHVR Auditor Registration Number",
38
+ "expiry Date:"
39
+ ],
40
+ "priority": 80
41
+ },
42
+ "Operator Information": {
43
+ "headings": [
44
+ {"level": 1, "text": "Operator Information"}
45
+ ],
46
+ "orientation": "left",
47
+ "labels": [
48
+ "Operator name (Legal entity)",
49
+ "NHVAS Accreditation No. (If applicable)",
50
+ "Registered trading name/s",
51
+ "Australian Company Number",
52
+ "NHVAS Manual (Policies and Procedures) developed by"
53
+ ],
54
+ "priority": 85
55
+ },
56
+ "Operator contact details": {
57
+ "orientation": "left",
58
+ "labels": [
59
+ "Operator business address",
60
+ "Operator Postal address",
61
+ "Email address",
62
+ "Operator Telephone Number"
63
+ ],
64
+ "priority": 75,
65
+ "context_keywords": ["contact", "address", "email", "telephone"]
66
+ },
67
+ "Attendance List (Names and Position Titles)": {
68
+ "headings": [
69
+ {"level": 1, "text": "NHVAS Audit Summary Report"}
70
+ ],
71
+ "orientation": "row1",
72
+ "labels": ["Attendance List (Names and Position Titles)"],
73
+ "priority": 90
74
+ },
75
+ "Nature of the Operators Business (Summary)": {
76
+ "orientation": "row1",
77
+ "labels": ["Nature of the Operators Business (Summary):"],
78
+ "split_labels": ["Accreditation Number:", "Expiry Date:"],
79
+ "priority": 85
80
+ },
81
+ "Accreditation Vehicle Summary": {
82
+ "orientation": "left",
83
+ "labels": ["Number of powered vehicles", "Number of trailing vehicles"],
84
+ "priority": 80
85
+ },
86
+ "Accreditation Driver Summary": {
87
+ "orientation": "left",
88
+ "labels": ["Number of drivers in BFM", "Number of drivers in AFM"],
89
+ "priority": 80
90
+ },
91
+ "Compliance Codes": {
92
+ "orientation": "left",
93
+ "labels": ["V", "NC", "TNC", "SFI", "NAP", "NA"],
94
+ "priority": 70,
95
+ "context_exclusions": ["MASS MANAGEMENT", "MAINTENANCE MANAGEMENT", "FATIGUE MANAGEMENT"]
96
+ },
97
+ "Corrective Action Request Identification": {
98
+ "orientation": "row1",
99
+ "labels": ["Title", "Abbreviation", "Description"],
100
+ "priority": 80
101
+ },
102
+
103
+ # 🎯 BASIC MANAGEMENT SCHEMAS (Compliance Tables - Lower Priority)
104
+ "Maintenance Management": {
105
+ "headings": [
106
+ {"level": 1, "text": "NHVAS AUDIT SUMMARY REPORT"}
107
+ ],
108
+ "orientation": "left",
109
+ "labels": [
110
+ "Std 1. Daily Check",
111
+ "Std 2. Fault Recording and Reporting",
112
+ "Std 3. Fault Repair",
113
+ "Std 4. Maintenance Schedules and Methods",
114
+ "Std 5. Records and Documentation",
115
+ "Std 6. Responsibilities",
116
+ "Std 7. Internal Review",
117
+ "Std 8. Training and Education"
118
+ ],
119
+ "priority": 60,
120
+ "context_keywords": ["maintenance"],
121
+ "context_exclusions": ["summary", "details", "audit findings"] # Exclude Summary tables
122
+ },
123
+ "Mass Management": {
124
+ "headings": [
125
+ {"level": 1, "text": "NHVAS AUDIT SUMMARY REPORT"}
126
+ ],
127
+ "orientation": "left",
128
+ "labels": [
129
+ "Std 1. Responsibilities",
130
+ "Std 2. Vehicle Control",
131
+ "Std 3. Vehicle Use",
132
+ "Std 4. Records and Documentation",
133
+ "Std 5. Verification",
134
+ "Std 6. Internal Review",
135
+ "Std 7. Training and Education",
136
+ "Std 8. Maintenance of Suspension"
137
+ ],
138
+ "priority": 60,
139
+ "context_keywords": ["mass"],
140
+ "context_exclusions": ["summary", "details", "audit findings"] # Exclude Summary tables
141
+ },
142
+ "Fatigue Management": {
143
+ "headings": [
144
+ {"level": 1, "text": "NHVAS AUDIT SUMMARY REPORT"}
145
+ ],
146
+ "orientation": "left",
147
+ "labels": [
148
+ "Std 1. Scheduling and Rostering",
149
+ "Std 2. Health and wellbeing for performed duty",
150
+ "Std 3. Training and Education",
151
+ "Std 4. Responsibilities and management practices",
152
+ "Std 5. Internal Review",
153
+ "Std 6. Records and Documentation",
154
+ "Std 7. Workplace conditions"
155
+ ],
156
+ "priority": 60,
157
+ "context_keywords": ["fatigue"],
158
+ "context_exclusions": ["summary", "details", "audit findings"] # Exclude Summary tables
159
+ },
160
+
161
+ # 🎯 SUMMARY MANAGEMENT SCHEMAS (Detailed Tables with DETAILS column - Higher Priority)
162
+ "Maintenance Management Summary": {
163
+ "headings": [
164
+ {"level": 1, "text": "Audit Observations and Comments"},
165
+ {"level": 2, "text": "Maintenance Management Summary of Audit findings"}
166
+ ],
167
+ "orientation": "left",
168
+ "columns": ["MAINTENANCE MANAGEMENT", "DETAILS"],
169
+ "labels": [
170
+ "Std 1. Daily Check",
171
+ "Std 2. Fault Recording and Reporting",
172
+ "Std 3. Fault Repair",
173
+ "Std 4. Maintenance Schedules and Methods",
174
+ "Std 5. Records and Documentation",
175
+ "Std 6. Responsibilities",
176
+ "Std 7. Internal Review",
177
+ "Std 8. Training and Education"
178
+ ],
179
+ "priority": 85, # Higher priority than basic Maintenance Management
180
+ "context_keywords": ["maintenance", "summary", "details", "audit findings"]
181
+ },
182
+ "Mass Management Summary": {
183
+ "headings": [
184
+ {"level": 1, "text": "Mass Management Summary of Audit findings"}
185
+ ],
186
+ "orientation": "left",
187
+ "columns": ["MASS MANAGEMENT", "DETAILS"],
188
+ "labels": [
189
+ "Std 1. Responsibilities",
190
+ "Std 2. Vehicle Control",
191
+ "Std 3. Vehicle Use",
192
+ "Std 4. Records and Documentation",
193
+ "Std 5. Verification",
194
+ "Std 6. Internal Review",
195
+ "Std 7. Training and Education",
196
+ "Std 8. Maintenance of Suspension"
197
+ ],
198
+ "priority": 85, # Higher priority than basic Mass Management
199
+ "context_keywords": ["mass", "summary", "details", "audit findings"]
200
+ },
201
+ "Fatigue Management Summary": {
202
+ "headings": [
203
+ {"level": 1, "text": "Fatigue Management Summary of Audit findings"}
204
+ ],
205
+ "orientation": "left",
206
+ "columns": ["FATIGUE MANAGEMENT", "DETAILS"],
207
+ "labels": [
208
+ "Std 1. Scheduling and Rostering",
209
+ "Std 2. Health and wellbeing for performed duty",
210
+ "Std 3. Training and Education",
211
+ "Std 4. Responsibilities and management practices",
212
+ "Std 5. Internal Review",
213
+ "Std 6. Records and Documentation",
214
+ "Std 7. Workplace conditions"
215
+ ],
216
+ "priority": 85, # Higher priority than basic Fatigue Management
217
+ "context_keywords": ["fatigue", "summary", "details", "audit findings"]
218
+ },
219
+
220
+ # Vehicle Registration Tables
221
+ "Vehicle Registration Numbers Mass": {
222
+ "headings": [
223
+ {"level": 1, "text": "Vehicle Registration Numbers of Records Examined"},
224
+ {"level": 2, "text": "MASS MANAGEMENT"}
225
+ ],
226
+ "orientation": "row1",
227
+ "labels": [
228
+ "No.", "Registration Number", "Sub contractor",
229
+ "Sub-contracted Vehicles Statement of Compliance",
230
+ "Weight Verification Records",
231
+ "RFS Suspension Certification #",
232
+ "Suspension System Maintenance", "Trip Records",
233
+ "Fault Recording/ Reporting on Suspension System"
234
+ ],
235
+ "priority": 90, # Higher priority
236
+ "context_keywords": ["mass", "vehicle registration", "rfs suspension", "weight verification"],
237
+ "context_exclusions": ["maintenance", "roadworthiness", "daily checks"] # Exclude maintenance-specific terms
238
+ },
239
+ "Vehicle Registration Numbers Maintenance": {
240
+ "headings": [
241
+ {"level": 1, "text": "Vehicle Registration Numbers of Records Examined"},
242
+ {"level": 2, "text": "Maintenance Management"}
243
+ ],
244
+ "orientation": "row1",
245
+ "labels": [
246
+ "No.", "Registration Number", "Roadworthiness Certificates",
247
+ "Maintenance Records", "Daily Checks",
248
+ "Fault Recording/ Reporting", "Fault Repair"
249
+ ],
250
+ "priority": 85, # Lower priority
251
+ "context_keywords": ["maintenance", "vehicle registration", "roadworthiness", "daily checks"],
252
+ "context_exclusions": ["mass", "rfs suspension", "weight verification"] # Exclude mass-specific terms
253
+ },
254
+ "Driver / Scheduler Records Examined": {
255
+ "headings": [
256
+ {"level": 1, "text": "Driver / Scheduler Records Examined"},
257
+ {"level": 2, "text": "FATIGUE MANAGEMENT"},
258
+ ],
259
+ "orientation": "row1",
260
+ "labels": [
261
+ "No.",
262
+ "Driver / Scheduler Name",
263
+ "Driver TLIF Course # Completed",
264
+ "Scheduler TLIF Course # Completed",
265
+ "Medical Certificates (Current Yes/No) Date of expiry",
266
+ "Roster / Schedule / Safe Driving Plan (Date Range)",
267
+ "Fit for Duty Statement Completed (Yes/No)",
268
+ "Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"
269
+ ],
270
+ "priority": 80,
271
+ "context_keywords": ["driver", "scheduler", "fatigue"]
272
+ },
273
+
274
+ # Other Tables
275
+ "Operator's Name (legal entity)": {
276
+ "headings": [
277
+ {"level": 1, "text": "CORRECTIVE ACTION REQUEST (CAR)"}
278
+ ],
279
+ "orientation": "left",
280
+ "labels": ["Operator's Name (legal entity)"],
281
+ "priority": 85
282
+ },
283
+ "Non-conformance and CAR details": {
284
+ "orientation": "left",
285
+ "labels": [
286
+ "Non-conformance agreed close out date",
287
+ "Module and Standard",
288
+ "Corrective Action Request (CAR) Number",
289
+ "Observed Non-conformance:",
290
+ "Corrective Action taken or to be taken by operator:",
291
+ "Operator or Representative Signature",
292
+ "Position",
293
+ "Date",
294
+ "Comments:",
295
+ "Auditor signature",
296
+ "Date"
297
+ ],
298
+ "priority": 75,
299
+ "context_keywords": ["non-conformance", "corrective action"]
300
+ },
301
+ "NHVAS Approved Auditor Declaration": {
302
+ "headings": [
303
+ {"level": 1, "text": "NHVAS APPROVED AUDITOR DECLARATION"}
304
+ ],
305
+ "orientation": "row1",
306
+ "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
307
+ "priority": 90,
308
+ "context_keywords": ["auditor declaration", "NHVR"],
309
+ "context_exclusions": ["manager", "operator declaration"]
310
+ },
311
+ "Audit Declaration dates": {
312
+ "headings": [
313
+ {"level": 1, "text": "Audit Declaration dates"}
314
+ ],
315
+ "orientation": "left",
316
+ "labels": [
317
+ "Audit was conducted on",
318
+ "Unconditional CARs closed out on:",
319
+ "Conditional CARs to be closed out by:"
320
+ ],
321
+ "priority": 80
322
+ },
323
+ "Print accreditation name": {
324
+ "headings": [
325
+ {"level": 1, "text": "(print accreditation name)"}
326
+ ],
327
+ "orientation": "left",
328
+ "labels": ["(print accreditation name)"],
329
+ "priority": 85
330
+ },
331
+ "Operator Declaration": {
332
+ "headings": [
333
+ {"level": 1, "text": "Operator Declaration"}
334
+ ],
335
+ "orientation": "row1",
336
+ "labels": ["Print Name", "Position Title"],
337
+ "priority": 90,
338
+ "context_keywords": ["operator declaration", "manager"],
339
+ "context_exclusions": ["auditor", "nhvas approved"]
340
+ }
341
+ }
342
+
343
+ # 2. Enhanced heading detection patterns
344
+ HEADING_PATTERNS = {
345
+ "main": [
346
+ r"NHVAS\s+Audit\s+Summary\s+Report",
347
+ r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
348
+ r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT"
349
+ ],
350
+ "sub": [
351
+ r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
352
+ r"MAINTENANCE\s+MANAGEMENT",
353
+ r"MASS\s+MANAGEMENT",
354
+ r"FATIGUE\s+MANAGEMENT",
355
+ r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
356
+ r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
357
+ r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
358
+ r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
359
+ r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
360
+ r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
361
+ r"Operator\s+Declaration",
362
+ r"Operator\s+Information"
363
+ ]
364
+ }
365
+
366
+ # 3. Enhanced paragraph patterns for key narrative sections
367
+ PARAGRAPH_PATTERNS = {
368
+ "findings_summary": r"Provide a summary of findings based on the evidence gathered during the audit\.",
369
+ "declaration_text": r"I hereby acknowledge and agree with the findings.*",
370
+ "introductory_note": r"This audit assesses the.*",
371
+ "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"
372
+ }
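
As a quick illustration of how these structures are consumed by `extract_red_text.py` (illustrative only; the real matching logic lives there):

```python
import re

from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS

# Schemas carry an explicit priority; higher values win ties during matching.
for name, spec in sorted(TABLE_SCHEMAS.items(), key=lambda kv: kv[1].get("priority", 0), reverse=True)[:3]:
    print(name, spec.get("orientation"), len(spec.get("labels", [])), "labels")

# Heading patterns are plain regex strings, matched case-insensitively by callers.
all_headings = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
print(any(re.search(p, "NHVAS Audit Summary Report", re.IGNORECASE) for p in all_headings))

# date_line accepts either an ordinal date line ("9th March 2023") or the literal "Date".
print(bool(re.fullmatch(PARAGRAPH_PATTERNS["date_line"], "9th March 2023")))
```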
space-pdf/packages.txt ADDED
@@ -0,0 +1,2 @@
1
+ poppler-utils
2
+ tesseract-ocr
space-pdf/requirements.txt ADDED
@@ -0,0 +1,37 @@
1
+ fastapi==0.111.1
2
+ pydantic==2.11.0
3
+ python-multipart==0.0.9
4
+ uvicorn==0.30.3
5
+ gunicorn==22.0.0
6
+ requests==2.32.3
7
+ torch==2.4.0
8
+ torchvision==0.19.0
9
+ Pillow==10.4.0
10
+ pdf-annotate==0.12.0
11
+ scipy==1.14.0
12
+ opencv-python==4.10.0.84
13
+ Shapely==2.0.5
14
+ transformers==4.40.2
15
+ huggingface_hub==0.23.5
16
+ pdf2image==1.17.0
17
+ lightgbm==4.5.0
18
+ setuptools==75.4.0
19
+ roman==4.2
20
+ hydra-core==1.3.2
21
+ pypandoc==1.13
22
+ rapid-table==2.0.3
23
+ rapidocr==3.2.0
24
+ pix2tex==0.1.4
25
+ latex2mathml==3.78.0
26
+ PyMuPDF==1.25.5
27
+ git+https://github.com/huridocs/pdf-features.git@2025.7.30.1
28
+ gradio==4.44.1
29
+ pytesseract
30
+ python-docx
31
+ camelot-py[cv] # for digital-table parsing
32
+ pdf2image # for fallback OCR on images
33
+ pytesseract
34
+ Pillow
35
+ rapidfuzz
36
+ pdfplumber
37
+ openai
space-pdf/update_docx_with_pdf.py ADDED
@@ -0,0 +1,1470 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced NHVAS PDF to DOCX JSON Merger
4
+ Comprehensive extraction and mapping from PDF to DOCX structure
5
+ (keep pipeline intact; fix spacing, operator info mapping, vehicle-reg header mapping, date fallback)
6
+ """
7
+ import json
8
+ import re
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Optional
12
+ from collections import OrderedDict
13
+
14
+
15
+ def _nz(x):
16
+ return x if isinstance(x, str) and x.strip() else ""
17
+
18
+ SUMMARY_SECTIONS = {
19
+ "MAINTENANCE MANAGEMENT": "Maintenance Management Summary",
20
+ "MASS MANAGEMENT": "Mass Management Summary",
21
+ "FATIGUE MANAGEMENT": "Fatigue Management Summary",
22
+ }
23
+
24
+ # ───────────────────────────── helpers: text cleanup & label matching ─────────────────────────────
25
+ def _canon_header(s: str) -> str:
26
+ if not s: return ""
27
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
28
+ s = s.replace("–", "-").replace("—", "-")
29
+ s = re.sub(r"[/]+", " / ", s)
30
+ s = re.sub(r"[^a-z0-9#/ ]+", " ", s)
31
+ s = re.sub(r"\s+", " ", s).strip()
32
+ return s
33
+
34
+
35
+ # Header aliases -> internal keys we already use later during mapping
36
+ _VEH_HEADER_ALIASES = {
37
+ # common
38
+ "registration number": "registration",
39
+ "reg no": "registration",
40
+ "reg.#": "registration",
41
+ "no.": "no",
42
+ "no": "no",
43
+
44
+ # maintenance table
45
+ "roadworthiness certificates": "roadworthiness",
46
+ "maintenance records": "maintenance_records",
47
+ "daily checks": "daily_checks",
48
+ "fault recording reporting": "fault_recording",
49
+ "fault recording / reporting": "fault_recording",
50
+ "fault repair": "fault_repair",
51
+
52
+ # mass table
53
+ "sub contractor": "sub_contractor",
54
+ "sub-contractor": "sub_contractor",
55
+ "sub contracted vehicles statement of compliance": "sub_comp",
56
+ "sub-contracted vehicles statement of compliance": "sub_comp",
57
+ "weight verification records": "weight_verification",
58
+ "rfs suspension certification #": "rfs_certification",
59
+ "rfs suspension certification number": "rfs_certification",
60
+ "suspension system maintenance": "suspension_maintenance",
61
+ "trip records": "trip_records",
62
+ "fault recording reporting on suspension system": "fault_reporting_suspension",
63
+ "fault recording / reporting on suspension system": "fault_reporting_suspension",
64
+ }
65
+
66
+ # --- helpers ---
67
+ def build_vehicle_sections(extracted: dict) -> dict:
68
+ """Build arrays for Maintenance and Mass tables. Maintenance uses recorded rows to include ALL entries."""
69
+ maint = {
70
+ "Registration Number": [],
71
+ "Roadworthiness Certificates": [],
72
+ "Maintenance Records": [],
73
+ "Daily Checks": [],
74
+ "Fault Recording/ Reporting": [],
75
+ "Fault Repair": [],
76
+ }
77
+ mass = {
78
+ "Registration Number": [],
79
+ "Weight Verification Records": [],
80
+ "RFS Suspension Certification #": [],
81
+ "Suspension System Maintenance": [],
82
+ "Trip Records": [],
83
+ "Fault Recording/ Reporting on Suspension System": [],
84
+ }
85
+
86
+ # Prefer authoritative maintenance rows captured during parsing (spans all pages)
87
+ if extracted.get("_maint_rows"):
88
+ for row in extracted["_maint_rows"]:
89
+ maint["Registration Number"].append(_smart_space(row.get("registration", "")))
90
+ maint["Roadworthiness Certificates"].append(_nz(row.get("roadworthiness", "")))
91
+ maint["Maintenance Records"].append(_nz(row.get("maintenance_records", "")))
92
+ maint["Daily Checks"].append(_nz(row.get("daily_checks", "")))
93
+ maint["Fault Recording/ Reporting"].append(_nz(row.get("fault_recording", "")))
94
+ maint["Fault Repair"].append(_nz(row.get("fault_repair", "")))
95
+ else:
96
+ # Fallback to vehicles map (older behavior)
97
+ for v in extracted.get("vehicles", []) or []:
98
+ if not v.get("registration"): continue
99
+ if v.get("seen_in_maintenance") or any(v.get(k) for k in ["roadworthiness","maintenance_records","daily_checks","fault_recording","fault_repair"]):
100
+ rw = _nz(v.get("roadworthiness", "")); mr = _nz(v.get("maintenance_records", "")); dc = _nz(v.get("daily_checks", ""))
101
+ fr = _nz(v.get("fault_recording", "")); rp = _nz(v.get("fault_repair", ""))
102
+ if not mr and dc: mr = dc
103
+ if not rp and fr: rp = fr
104
+ if not fr and rp: fr = rp
105
+ maint["Registration Number"].append(_smart_space(v["registration"]))
106
+ maint["Roadworthiness Certificates"].append(rw)
107
+ maint["Maintenance Records"].append(mr)
108
+ maint["Daily Checks"].append(dc)
109
+ maint["Fault Recording/ Reporting"].append(fr)
110
+ maint["Fault Repair"].append(rp)
111
+
112
+ # Mass stays as-is (from vehicles)
113
+ for v in extracted.get("vehicles", []) or []:
114
+ if not v.get("registration"): continue
115
+ if v.get("seen_in_mass") or any(v.get(k) for k in ["weight_verification","rfs_certification","suspension_maintenance","trip_records","fault_reporting_suspension"]):
116
+ mass["Registration Number"].append(_smart_space(v["registration"]))
117
+ mass["Weight Verification Records"].append(_nz(v.get("weight_verification", "")))
118
+ mass["RFS Suspension Certification #"].append(_nz(v.get("rfs_certification", "")))
119
+ mass["Suspension System Maintenance"].append(_nz(v.get("suspension_maintenance", "")))
120
+ mass["Trip Records"].append(_nz(v.get("trip_records", "")))
121
+ mass["Fault Recording/ Reporting on Suspension System"].append(_nz(v.get("fault_reporting_suspension", "")))
122
+
123
+ return {
124
+ "Vehicle Registration Numbers Maintenance": maint,
125
+ "Vehicle Registration Numbers Mass": mass,
126
+ }
127
+
128
+
129
+ def _map_header_indices(headers: list[str]) -> dict:
130
+ """Return {internal_key: column_index} by matching/aliasing header text."""
131
+ idx = {}
132
+ for i, h in enumerate(headers or []):
133
+ ch = _canon_header(h)
134
+ # try direct alias
135
+ if ch in _VEH_HEADER_ALIASES:
136
+ idx[_VEH_HEADER_ALIASES[ch]] = i
137
+ continue
138
+ # relax a little for 'registration number' variants
139
+ if "registration" in ch and "number" in ch:
140
+ idx["registration"] = i
141
+ continue
142
+ if "roadworthiness" in ch:
143
+ idx["roadworthiness"] = i
144
+ continue
145
+ if "maintenance" in ch and "records" in ch:
146
+ idx["maintenance_records"] = i
147
+ continue
148
+ if "daily" in ch and "check" in ch:
149
+ idx["daily_checks"] = i
150
+ continue
151
+ if "fault" in ch and "record" in ch and "suspension" not in ch:
152
+ # maintenance fault-recording column
153
+ if "repair" in ch:
154
+ idx["fault_repair"] = i
155
+ else:
156
+ idx["fault_recording"] = i
157
+ continue
158
+ if "weight" in ch and "verification" in ch:
159
+ idx["weight_verification"] = i
160
+ continue
161
+ if "rfs" in ch and "certification" in ch:
162
+ idx["rfs_certification"] = i
163
+ continue
164
+ if "suspension" in ch and "maintenance" in ch:
165
+ idx["suspension_maintenance"] = i
166
+ continue
167
+ if "trip" in ch and "record" in ch:
168
+ idx["trip_records"] = i
169
+ continue
170
+ if "fault" in ch and "report" in ch and "suspension" in ch:
171
+ idx["fault_reporting_suspension"] = i
172
+ continue
173
+ return idx
174
+
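+ # Illustrative sketch (not part of the original commit): how a typical header row resolves to
+ # internal keys. The header strings are assumptions, and the expected result presumes
+ # _canon_header lower-cases and strips punctuation as used above.
+ def _demo_map_header_indices() -> dict:
+     headers = ["No.", "Registration Number", "Roadworthiness Certificates", "Daily Checks"]
+     # roughly -> {"registration": 1, "roadworthiness": 2, "daily_checks": 3}
+     return _map_header_indices(headers)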
175
+ def _canon(s: str) -> str:
176
+ if not s: return ""
177
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
178
+ s = re.sub(r"[^a-z0-9#]+", " ", s)
179
+ return re.sub(r"\s+", " ", s).strip()
180
+
181
+ def _smart_space(s: str) -> str:
182
+ if not s: return s
183
+ s = str(s)
184
+
185
+ # Insert spaces at typical OCR glue points
186
+ s = re.sub(r'([a-z])([A-Z])', r'\1 \2', s)
187
+ s = re.sub(r'([A-Za-z])(\d)', r'\1 \2', s)
188
+ s = re.sub(r'(\d)([A-Za-z])', r'\1 \2', s)
189
+ s = re.sub(r'([A-Z]{2,})(\d)', r'\1 \2', s)
190
+
191
+ # Fix common glued tokens
192
+ s = s.replace("POBox", "PO Box")
193
+
194
+ # Compact ordinals back together: "9 th" -> "9th", but preserve a space after the ordinal if followed by a word
195
+ s = re.sub(r'\b(\d+)\s*(st|nd|rd|th)\b', r'\1\2', s)
196
+
197
+ s = re.sub(r"\s+", " ", s).strip()
198
+ return s
199
+
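+ # Illustrative sketch: the kind of OCR "glue" _smart_space undoes (sample strings are made up).
+ def _demo_smart_space() -> list:
+     samples = ["POBox123Somewhere", "20thFebruary2023", "ABC123"]
+     # e.g. "20thFebruary2023" -> "20th February 2023", "POBox123Somewhere" -> "PO Box 123 Somewhere"
+     return [_smart_space(s) for s in samples]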
200
+ def looks_like_plate(s: str) -> bool:
201
+ if not s: return False
202
+ t = re.sub(r"[\s-]", "", str(s).upper())
203
+ if not (5 <= len(t) <= 8): return False
204
+ if not re.fullmatch(r"[A-Z0-9]+", t): return False
205
+ if sum(c.isalpha() for c in t) < 2: return False
206
+ if sum(c.isdigit() for c in t) < 2: return False
207
+ if t in {"ENTRY","YES","NO","N/A","NA"}: return False
208
+ return True
209
+
210
+ def is_dateish(s: str) -> bool:
211
+ if not s: return False
212
+ s = _smart_space(s)
213
+ # tokens like 03/22, 20/02/2023, 01.02.21, 2023-02-20
214
+ return bool(re.search(r"\b\d{1,4}[./-]\d{1,2}(?:[./-]\d{1,4})?\b", s))
215
+
216
+ def extract_date_tokens(s: str) -> list[str]:
217
+ if not s: return []
218
+ s = _smart_space(s)
219
+ return re.findall(r"\b\d{1,4}[./-]\d{1,2}(?:[./-]\d{1,4})?\b", s)  # final part may be a 4-digit year (e.g. 20/02/2023)
220
+
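+ # Illustrative sketch of the row-level heuristics together (sample values are assumptions):
+ def _demo_row_heuristics() -> tuple:
+     plate_ok = looks_like_plate("XT02GH")                   # True: 5-8 alphanumerics, >=2 letters, >=2 digits
+     has_date = is_dateish("Serviced 20/02/2023")            # True
+     tokens   = extract_date_tokens("2023-02-20 and 03/22")  # ["2023-02-20", "03/22"]
+     return plate_ok, has_date, tokens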
221
+
222
+ def _clean_list(vals: List[str]) -> List[str]:
223
+ out = []
224
+ for v in vals:
225
+ v = _smart_space(v)
226
+ if v:
227
+ out.append(v)
228
+ return out
229
+
230
+ def _looks_like_manual_value(s: str) -> bool:
231
+ if not s: return False
232
+ s = s.strip()
233
+ # reject long pure-digit strings (e.g. "51902"); values must contain letters to be accepted below
234
+ if re.fullmatch(r"\d{3,}", s):
235
+ return False
236
+ # accept if it has any letters or typical version hints
237
+ return bool(re.search(r"[A-Za-z]", s))
238
+
239
+ def _looks_like_company(s: str) -> bool:
240
+ """Very light validation to avoid capturing labels as values."""
241
+ if not s: return False
242
+ s = _smart_space(s)
243
+ # at least two words containing letters (e.g., "Kangaroo Transport")
244
+ return bool(re.search(r"[A-Za-z]{2,}\s+[A-Za-z&]{2,}", s))
245
+
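+ # Illustrative sketch of the light-weight value filters (sample strings are assumptions):
+ def _demo_value_filters() -> dict:
+     return {
+         "manual_numeric": _looks_like_manual_value("51902"),                  # False: bare number
+         "manual_text":    _looks_like_manual_value("Prepared by Consultant"), # True: contains letters
+         "company_label":  _looks_like_company("ACN"),                         # False: single word
+         "company_name":   _looks_like_company("Kangaroo Transport Pty Ltd"),  # True: multi-word name
+     }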
246
+ # ───────────────────────────── label index (non-summary only; no values) ─────────────────────────────
247
+ LABEL_INDEX: Dict[str, Dict[str, Dict[str, Any]]] = {
248
+ "Audit Information": {
249
+ "Date of Audit": {"alts": ["Date of Audit"]},
250
+ "Location of audit": {"alts": ["Location of audit", "Location"]},
251
+ "Auditor name": {"alts": ["Auditor name", "Auditor"]},
252
+ "Audit Matrix Identifier (Name or Number)": {"alts": ["Audit Matrix Identifier (Name or Number)", "Audit Matrix Identifier"]},
253
+ "Auditor Exemplar Global Reg No.": {"alts": ["Auditor Exemplar Global Reg No."]},
254
+ "NHVR Auditor Registration Number": {"alts": ["NHVR Auditor Registration Number"]},
255
+ "expiry Date:": {"alts": ["expiry Date:", "Expiry Date:"]},
256
+ },
257
+ "Operator Information": {
258
+ "Operator name (Legal entity)": {"alts": ["Operator name (Legal entity)", "Operator's Name (legal entity)"]},
259
+ "NHVAS Accreditation No. (If applicable)": {"alts": ["NHVAS Accreditation No. (If applicable)", "NHVAS Accreditation No."]},
260
+ "Registered trading name/s": {"alts": ["Registered trading name/s", "Trading name/s"]},
261
+ "Australian Company Number": {"alts": ["Australian Company Number", "ACN"]},
262
+ "NHVAS Manual (Policies and Procedures) developed by": {"alts": [
263
+ "NHVAS Manual (Policies and Procedures) developed by",
264
+ "NHVAS Manual developed by",
265
+ "Manual developed by"
266
+ ]},
267
+ },
268
+ "Operator contact details": {
269
+ "Operator business address": {"alts": ["Operator business address", "Business address"]},
270
+ "Operator Postal address": {"alts": ["Operator Postal address", "Postal address"]},
271
+ "Email address": {"alts": ["Email address", "Email"]},
272
+ "Operator Telephone Number": {"alts": ["Operator Telephone Number", "Telephone", "Phone"]},
273
+ },
274
+ "Attendance List (Names and Position Titles)": {
275
+ "Attendance List (Names and Position Titles)": {"alts": ["Attendance List (Names and Position Titles)", "Attendance List"]},
276
+ },
277
+ "Nature of the Operators Business (Summary)": {
278
+ "Nature of the Operators Business (Summary):": {"alts": ["Nature of the Operators Business (Summary):"]},
279
+ },
280
+ "Accreditation Vehicle Summary": {
281
+ "Number of powered vehicles": {"alts": ["Number of powered vehicles"]},
282
+ "Number of trailing vehicles": {"alts": ["Number of trailing vehicles"]},
283
+ },
284
+ "Accreditation Driver Summary": {
285
+ "Number of drivers in BFM": {"alts": ["Number of drivers in BFM"]},
286
+ "Number of drivers in AFM": {"alts": ["Number of drivers in AFM"]},
287
+ },
288
+ "Vehicle Registration Numbers Maintenance": {
289
+ "No.": {"alts": ["No.", "No"]},
290
+ "Registration Number": {"alts": ["Registration Number", "Registration"]},
291
+ "Roadworthiness Certificates": {"alts": ["Roadworthiness Certificates", "Roadworthiness"]},
292
+ "Maintenance Records": {"alts": ["Maintenance Records"]},
293
+ "Daily Checks": {"alts": ["Daily Checks", "Daily Check"]},
294
+ "Fault Recording/ Reporting": {"alts": ["Fault Recording/ Reporting", "Fault Recording / Reporting"]},
295
+ "Fault Repair": {"alts": ["Fault Repair"]},
296
+ },
297
+ "Vehicle Registration Numbers Mass": {
298
+ "No.": {"alts": ["No.", "No"]},
299
+ "Registration Number": {"alts": ["Registration Number", "Registration"]},
300
+ "Sub contractor": {"alts": ["Sub contractor", "Sub-contractor"]},
301
+ "Sub-contracted Vehicles Statement of Compliance": {"alts": ["Sub-contracted Vehicles Statement of Compliance"]},
302
+ "Weight Verification Records": {"alts": ["Weight Verification Records"]},
303
+ "RFS Suspension Certification #": {"alts": ["RFS Suspension Certification #", "RFS Suspension Certification Number"]},
304
+ "Suspension System Maintenance": {"alts": ["Suspension System Maintenance"]},
305
+ "Trip Records": {"alts": ["Trip Records"]},
306
+ "Fault Recording/ Reporting on Suspension System": {"alts": ["Fault Recording/ Reporting on Suspension System"]},
307
+ },
308
+ "Driver / Scheduler Records Examined": {
309
+ "No.": {"alts": ["No.", "No"]},
310
+ "Driver / Scheduler Name": {"alts": ["Driver / Scheduler Name"]},
311
+ "Driver TLIF Course # Completed": {"alts": ["Driver TLIF Course # Completed"]},
312
+ "Scheduler TLIF Course # Completed": {"alts": ["Scheduler TLIF Course # Completed"]},
313
+ "Medical Certificates (Current Yes/No) Date of expiry": {"alts": ["Medical Certificates (Current Yes/No) Date of expiry"]},
314
+ "Roster / Schedule / Safe Driving Plan (Date Range)": {"alts": ["Roster / Schedule / Safe Driving Plan (Date Range)"]},
315
+ "Fit for Duty Statement Completed (Yes/No)": {"alts": ["Fit for Duty Statement Completed (Yes/No)"]},
316
+ "Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)": {"alts": ["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"]},
317
+ },
318
+ "NHVAS Approved Auditor Declaration": {
319
+ "Print Name": {"alts": ["Print Name"]},
320
+ "NHVR or Exemplar Global Auditor Registration Number": {"alts": ["NHVR or Exemplar Global Auditor Registration Number"]},
321
+ },
322
+ "Audit Declaration dates": {
323
+ "Audit was conducted on": {"alts": ["Audit was conducted on"]},
324
+ "Unconditional CARs closed out on:": {"alts": ["Unconditional CARs closed out on:"]},
325
+ "Conditional CARs to be closed out by:": {"alts": ["Conditional CARs to be closed out by:"]},
326
+ },
327
+ "Print accreditation name": {
328
+ "(print accreditation name)": {"alts": ["(print accreditation name)"]},
329
+ },
330
+ "Operator Declaration": {
331
+ "Print Name": {"alts": ["Print Name"]},
332
+ "Position Title": {"alts": ["Position Title"]},
333
+ },
334
+ }
335
+
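+ # Illustrative sketch (not part of the original commit): LABEL_INDEX maps
+ # section -> label -> {"alts": [...]}; a flat reverse lookup can be derived like this.
+ def _demo_label_lookup() -> dict:
+     lookup = {}
+     for section, labels in LABEL_INDEX.items():
+         for label, meta in labels.items():
+             for alt in meta.get("alts", [label]):
+                 lookup.setdefault(_canon(alt), (section, label))
+     return lookup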
336
+ class NHVASMerger:
337
+ def __init__(self):
338
+ self.debug_mode = True
339
+ self._vehicle_by_reg = OrderedDict()
340
+
341
+ def log_debug(self, msg: str):
342
+ if self.debug_mode:
343
+ print(f"🔍 {msg}")
344
+
345
+ def normalize_std_label(self, label: str) -> str:
346
+ if not label: return ""
347
+ base = re.sub(r"\([^)]*\)", "", label)
348
+ base = re.sub(r"\s+", " ", base).strip()
349
+ m = re.match(r"^(Std\s*\d+\.\s*[^:]+?)\s*$", base, flags=re.IGNORECASE)
350
+ return m.group(1).strip() if m else base
351
+
352
+ def _pick_nearby(self, row, anchor_idx: int | None, want: str = "plate", window: int = 3) -> str:
353
+ """Return the best cell for a field by looking at the anchor index and nearby columns.
354
+ want ∈ {"plate","date","rf","yn"}"""
355
+ def cell(i):
356
+ if i is None or i < 0 or i >= len(row): return ""
357
+ v = row[i]
358
+ return v.strip() if isinstance(v, str) else str(v).strip()
359
+
360
+ # 1) try the anchor cell
361
+ cand = cell(anchor_idx)
362
+ if want == "plate" and looks_like_plate(cand): return _smart_space(cand)
363
+ if want == "date" and is_dateish(cand): return _smart_space(cand)
364
+ if want == "rf" and re.search(r"\bRF\s*\d+\b", cand, re.I): return _smart_space(re.search(r"\bRF\s*\d+\b", cand, re.I).group(0))
365
+ if want == "yn" and cand.strip().lower() in {"yes","no"}: return cand.strip().title()
366
+
367
+ # 2) scan a window around the anchor
368
+ if anchor_idx is not None:
369
+ for offset in range(1, window+1):
370
+ for i in (anchor_idx - offset, anchor_idx + offset):
371
+ c = cell(i)
372
+ if not c: continue
373
+ if want == "plate" and looks_like_plate(c): return _smart_space(c)
374
+ if want == "date" and is_dateish(c): return _smart_space(c)
375
+ if want == "rf":
376
+ m = re.search(r"\bRF\s*\d+\b", c, re.I)
377
+ if m: return _smart_space(m.group(0))
378
+ if want == "yn" and c.strip().lower() in {"yes","no"}: return c.strip().title()
379
+
380
+ # 3) last resort: scan whole row
381
+ joined = " ".join(str(c or "") for c in row)
382
+ if want == "plate":
383
+ for tok in joined.split():
384
+ if looks_like_plate(tok): return _smart_space(tok)
385
+ if want == "date":
386
+ tok = extract_date_tokens(joined)
387
+ return tok[0] if tok else ""
388
+ if want == "rf":
389
+ m = re.search(r"\bRF\s*\d+\b", joined, re.I)
390
+ return _smart_space(m.group(0)) if m else ""
391
+ if want == "yn":
392
+ j = f" {joined.lower()} "
393
+ if " yes " in j: return "Yes"
394
+ if " no " in j: return "No"
395
+ return ""
396
+
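+     # Illustrative sketch (not part of the original commit): _pick_nearby tolerates
+     # misaligned columns by scanning around the header-derived anchor. The row is made up.
+     def _demo_pick_nearby(self) -> dict:
+         row = ["1.", "XT 02 GH", "", "20/02/2023", "Yes"]
+         return {
+             "plate": self._pick_nearby(row, 2, "plate"),  # "XT 02 GH", found one column from the anchor
+             "date": self._pick_nearby(row, 3, "date"),    # "20/02/2023"
+             "flag": self._pick_nearby(row, 4, "yn"),      # "Yes"
+         }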
397
+
398
+ def _force_fill_maintenance_from_tables(self, pdf_data: Dict, merged: Dict) -> None:
399
+ """Overwrite Maintenance arrays by scanning ALL maintenance tables across pages."""
400
+ maint = merged.get("Vehicle Registration Numbers Maintenance")
401
+ if not isinstance(maint, dict):
402
+ return
403
+
404
+ tables = (pdf_data.get("extracted_data") or {}).get("all_tables") or []
405
+ regs, rw, mr, dc, fr, rp = [], [], [], [], [], []
406
+
407
+ for t in tables:
408
+ hdrs = [_canon_header(h or "") for h in t.get("headers") or []]
409
+ if not hdrs:
410
+ continue
411
+ # detect a maintenance table
412
+ txt = " ".join(hdrs)
413
+ if ("registration" not in txt) or not any(
414
+ k in txt for k in ["maintenance records", "daily", "fault recording", "fault repair", "roadworthiness"]
415
+ ):
416
+ continue
417
+
418
+ def fidx(pred):
419
+ for i, h in enumerate(hdrs):
420
+ if pred(h):
421
+ return i
422
+ return None
423
+
424
+ reg_i = fidx(lambda h: "registration" in h)
425
+ rw_i = fidx(lambda h: "roadworthiness" in h)
426
+ mr_i = fidx(lambda h: "maintenance" in h and "record" in h)
427
+ dc_i = fidx(lambda h: "daily" in h and "check" in h)
428
+ fr_i = fidx(lambda h: "fault" in h and "record" in h and "suspension" not in h)
429
+ rp_i = fidx(lambda h: "fault" in h and "repair" in h)
430
+
431
+ for r in t.get("data") or []:
432
+ def cell(i):
433
+ if i is None or i >= len(r): return ""
434
+ v = r[i]
435
+ return v.strip() if isinstance(v, str) else str(v).strip()
436
+
437
+ plate = _smart_space(cell(reg_i))
438
+ if not plate or not looks_like_plate(plate):
439
+ continue
440
+
441
+ v_rw = _nz(cell(rw_i))
442
+ v_mr = _nz(cell(mr_i))
443
+ v_dc = _nz(cell(dc_i))
444
+ v_fr = _nz(cell(fr_i))
445
+ v_rp = _nz(cell(rp_i))
446
+
447
+ # sensible fallbacks
448
+ if not v_mr and v_dc: v_mr = v_dc
449
+ if not v_rp and v_fr: v_rp = v_fr
450
+ if not v_fr and v_rp: v_fr = v_rp
451
+
452
+ regs.append(plate); rw.append(v_rw); mr.append(v_mr)
453
+ dc.append(v_dc); fr.append(v_fr); rp.append(v_rp)
454
+
455
+ if regs: # overwrite arrays only if we found rows
456
+ maint["Registration Number"] = regs
457
+ maint["Roadworthiness Certificates"] = rw
458
+ maint["Maintenance Records"] = mr
459
+ maint["Daily Checks"] = dc
460
+ maint["Fault Recording/ Reporting"] = fr
461
+ maint["Fault Repair"] = rp
462
+
463
+ def _collapse_multiline_headers(self, headers: List[str], data_rows: List[List[str]]):
464
+ """
465
+ Merge header continuation rows (when first data rows are not numeric '1.', '2.', …)
466
+ into the main headers, then return (merged_headers, remaining_data_rows).
467
+ """
468
+ merged = [_smart_space(h or "") for h in (headers or [])]
469
+ consumed = 0
470
+ header_frags: List[List[str]] = []
471
+
472
+ # Collect up to 5 leading rows that look like header fragments
473
+ for r in data_rows[:5]:
474
+ first = (str(r[0]).strip() if r else "")
475
+ if re.match(r"^\d+\.?$", first):
476
+ break # real data starts
477
+ consumed += 1
478
+ header_frags.append(r)
479
+
480
+ # Merge every collected fragment row into merged
481
+ for frag in header_frags:
482
+ for i, cell in enumerate(frag):
483
+ cell_txt = _smart_space(str(cell or "").strip())
484
+ if not cell_txt:
485
+ continue
486
+ if i >= len(merged):
487
+ merged.append(cell_txt)
488
+ else:
489
+ merged[i] = (merged[i] + " " + cell_txt).strip()
490
+
491
+ return merged, data_rows[consumed:]
492
+
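+     # Illustrative sketch: a header split across two table rows (hypothetical values) is folded
+     # back into one header list before the numbered data rows are consumed.
+     def _demo_collapse_headers(self):
+         headers = ["No.", "Registration", ""]
+         rows = [["", "Number", "Roadworthiness Certificates"], ["1.", "XT02GH", "20/02/2023"]]
+         # -> (["No.", "Registration Number", "Roadworthiness Certificates"], [["1.", "XT02GH", "20/02/2023"]])
+         return self._collapse_multiline_headers(headers, rows)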
493
+ def _first_attendance_name_title(self, att_list: List[str]) -> Optional[tuple[str, str]]:
494
+ """Return (print_name, position_title) from the first 'Name - Title' in attendance."""
495
+ if not att_list:
496
+ return None
497
+ # First "Name - Title", stop before next "Name -"
498
+ pat = re.compile(
499
+ r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})\s*-\s*(.*?)(?=(?:\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3}\s*-\s*)|$)'
500
+ )
501
+ for item in att_list:
502
+ s = _smart_space(str(item))
503
+ m = pat.search(s)
504
+ if m:
505
+ name = _smart_space(m.group(1))
506
+ title = _smart_space(m.group(2))
507
+ return name, title
508
+ return None
509
+
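+     # Illustrative sketch (fictional attendee): the Operator Declaration fallback splits the
+     # first "Name - Title" attendance entry into its two parts.
+     def _demo_attendance_split(self):
+         # -> ("Jane Citizen", "Transport Manager")
+         return self._first_attendance_name_title(["Jane Citizen - Transport Manager"])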
510
+
511
+ # ───────────────────────────── summary tables (unchanged logic) ─────────────────────────────
512
+ def build_summary_maps(self, pdf_json: dict) -> dict:
513
+ out = {v: {} for v in SUMMARY_SECTIONS.values()}
514
+ try:
515
+ tables = pdf_json["extracted_data"]["all_tables"]
516
+ except Exception:
517
+ return out
518
+
519
+ for t in tables:
520
+ headers = [re.sub(r"\s+", " ", (h or "")).strip().upper() for h in t.get("headers", [])]
521
+ if "DETAILS" not in headers:
522
+ continue
523
+ section_key_raw = next((h for h in headers if h in SUMMARY_SECTIONS), None)
524
+ if not section_key_raw:
525
+ continue
526
+ section_name = SUMMARY_SECTIONS[section_key_raw]
527
+ for row in t.get("data", []):
528
+ if not row: continue
529
+ left = str(row[0]) if len(row) >= 1 else ""
530
+ right = str(row[1]) if len(row) >= 2 else ""
531
+ left_norm = self.normalize_std_label(left)
532
+ if left_norm and right:
533
+ prev = out[section_name].get(left_norm, "")
534
+ merged_text = (prev + " " + right).strip() if prev else right.strip()
535
+ out[section_name][left_norm] = merged_text
536
+
537
+ for sec in out:
538
+ out[sec] = {k: [_smart_space(v)] for k, v in out[sec].items() if v}
539
+ return out
540
+
541
+ # ───────────────────────────── NEW: find cell by label in tables ─────────────────────────────
542
+ def _find_table_value(self, tables: List[Dict], label_variants: List[str]) -> Optional[str]:
543
+ targets = {_canon(v) for v in label_variants}
544
+ for t in tables:
545
+ data = t.get("data", [])
546
+ if not data: continue
547
+ for row in data:
548
+ if not row: continue
549
+ key = _canon(str(row[0]))
550
+ if key in targets:
551
+ vals = [str(c).strip() for c in row[1:] if str(c).strip()]
552
+ if vals:
553
+ return _smart_space(" ".join(vals))
554
+ return None
555
+
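+     # Illustrative sketch (fabricated mini-table): label variants are canonicalised and matched
+     # against the first cell of each row, and the remaining cells become the value.
+     def _demo_find_table_value(self):
+         tables = [{"data": [["Date of Audit", "20/02/2023"], ["Auditor name", "J. Smith"]]}]
+         # -> "20/02/2023"
+         return self._find_table_value(tables, ["Date of Audit", "Audit date"])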
556
+ # ───────────────────────────── comprehensive extraction (minimal changes) ─────────────────────────────
557
+ def extract_from_pdf_comprehensive(self, pdf_data: Dict) -> Dict[str, Any]:
558
+ self._vehicle_by_reg.clear()
559
+ extracted = {}
560
+ extracted_data = pdf_data.get("extracted_data", {})
561
+ tables = extracted_data.get("all_tables", [])
562
+
563
+ # Capture "Audit was conducted on" from tables; ignore placeholder "Date"
564
+ awd = self._find_table_value(
565
+ tables,
566
+ LABEL_INDEX["Audit Declaration dates"]["Audit was conducted on"]["alts"]
567
+ )
568
+ if awd:
569
+ awd = _smart_space(awd)
570
+ if re.search(r"\d", awd) and not re.fullmatch(r"date", awd, re.I):
571
+ extracted["audit_conducted_date"] = awd
572
+
573
+
574
+
575
+ # 1) Audit Information (table first)
576
+ audit_info = extracted_data.get("audit_information", {})
577
+ if audit_info:
578
+ extracted["audit_info"] = {
579
+ "date_of_audit": _smart_space(audit_info.get("DateofAudit", "")),
580
+ "location": _smart_space(audit_info.get("Locationofaudit", "")),
581
+ "auditor_name": _smart_space(audit_info.get("Auditorname", "")),
582
+ "matrix_id": _smart_space(audit_info.get("AuditMatrixIdentifier (Name or Number)", "")),
583
+ }
584
+ # If missing, try generic table lookup
585
+ for label, meta in LABEL_INDEX.get("Audit Information", {}).items():
586
+ if label == "expiry Date:": # not used in your DOCX example
587
+ continue
588
+ val = self._find_table_value(tables, meta.get("alts", [label]))
589
+ if val:
590
+ extracted.setdefault("audit_info", {})
591
+ if _canon(label) == _canon("Date of Audit"): extracted["audit_info"]["date_of_audit"] = val
592
+ elif _canon(label) == _canon("Location of audit"): extracted["audit_info"]["location"] = val
593
+ elif _canon(label) == _canon("Auditor name"): extracted["audit_info"]["auditor_name"] = val
594
+ elif _canon(label) == _canon("Audit Matrix Identifier (Name or Number)"): extracted["audit_info"]["matrix_id"] = val
595
+
596
+ # 2) Operator Information (prefer table rows)
597
+ operator_info = extracted_data.get("operator_information", {})
598
+ if operator_info:
599
+ extracted["operator_info"] = {
600
+ "name": "",
601
+ "trading_name": _smart_space(operator_info.get("trading_name", "")),
602
+ "acn": _smart_space(operator_info.get("company_number", "")),
603
+ "manual": _smart_space(operator_info.get("nhvas_accreditation", "")),
604
+ "business_address": _smart_space(operator_info.get("business_address", "")),
605
+ "postal_address": _smart_space(operator_info.get("postal_address", "")),
606
+ "email": operator_info.get("email", ""),
607
+ "phone": _smart_space(operator_info.get("phone", "")),
608
+ }
609
+
610
+ # Fill operator info via table lookup
611
+ for label, meta in LABEL_INDEX.get("Operator Information", {}).items():
612
+ val = self._find_table_value(tables, meta.get("alts", [label]))
613
+ if not val: continue
614
+ if _canon(label) == _canon("Operator name (Legal entity)") and _looks_like_company(val):
615
+ extracted.setdefault("operator_info", {})
616
+ extracted["operator_info"]["name"] = val
617
+ elif _canon(label) == _canon("Registered trading name/s"):
618
+ extracted.setdefault("operator_info", {})
619
+ extracted["operator_info"]["trading_name"] = val
620
+ elif _canon(label) == _canon("Australian Company Number"):
621
+ extracted.setdefault("operator_info", {})
622
+ extracted["operator_info"]["acn"] = val
623
+ elif _canon(label) == _canon("NHVAS Manual (Policies and Procedures) developed by"):
624
+ extracted.setdefault("operator_info", {})
625
+ if _looks_like_manual_value(val):
626
+ extracted["operator_info"]["manual"] = val
627
+
628
+ # 3) Generic table parsing (unchanged logic for other sections)
629
+ self._extract_table_data(tables, extracted)
630
+
631
+ # 4) Text parsing (kept, but spacing applied)
632
+ self._extract_text_content(extracted_data.get("all_text_content", []), extracted)
633
+ # Vehicle tables sometimes fail to land in all_tables; parse from text as a fallback
634
+ self._extract_vehicle_tables_from_text(extracted_data.get("all_text_content", []), extracted)
635
+
636
+ # 5) Vehicle/Driver data (kept)
637
+ self._extract_vehicle_driver_data(extracted_data, extracted)
638
+
639
+ # 6) Detailed mgmt (kept)
640
+ self._extract_detailed_management_data(extracted_data, extracted)
641
+
642
+ return extracted
643
+
644
+ # ───────────────────────────── table classifiers ─────────────────────────────
645
+ def _extract_table_data(self, tables: List[Dict], extracted: Dict):
647
+ for table in tables:
648
+ headers = table.get("headers", []) or []
649
+ data_rows = table.get("data", []) or []
650
+ if not data_rows:
651
+ continue
652
+
653
+ page_num = table.get("page", 0)
654
+ self.log_debug(f"Processing table on page {page_num} with headers: {headers[:3]}...")
655
+
656
+ # 🔧 NEW: collapse possible multi-line headers once up front
657
+ collapsed_headers, collapsed_rows = self._collapse_multiline_headers(headers, data_rows)
658
+
659
+ # 🔧 Try vehicle tables FIRST using either raw or collapsed headers
660
+ if self._is_vehicle_registration_table(headers) or self._is_vehicle_registration_table(collapsed_headers):
661
+ # always extract with the collapsed header/rows so we see "Registration Number", etc.
662
+ self._extract_vehicle_registration_table(collapsed_headers, collapsed_rows, extracted, page_num)
663
+ continue
664
+
665
+ # the rest keep their existing order/logic (use the original headers/rows)
666
+ if self._is_audit_info_table(headers):
667
+ self._extract_audit_info_table(data_rows, extracted)
668
+ elif self._is_operator_info_table(headers):
669
+ self._extract_operator_info_table(data_rows, extracted)
670
+ elif self._is_attendance_table(headers):
671
+ self._extract_attendance_table(data_rows, extracted)
672
+ elif self._is_vehicle_summary_table(headers):
673
+ self._extract_vehicle_summary_table(data_rows, extracted)
674
+ elif self._is_driver_table(headers):
675
+ self._extract_driver_table(headers, data_rows, extracted)
676
+ elif self._is_management_compliance_table(headers):
677
+ self._extract_management_table(data_rows, extracted, headers)
678
+
679
+
680
+ def _is_audit_info_table(self, headers: List[str]) -> bool:
681
+ txt = " ".join(str(h) for h in headers).lower()
682
+ return any(t in txt for t in ["audit", "date", "location", "auditor"])
683
+
684
+ def _is_operator_info_table(self, headers: List[str]) -> bool:
685
+ txt = " ".join(str(h) for h in headers).lower()
686
+ return any(t in txt for t in ["operator", "company", "trading", "address"])
687
+
688
+ def _is_attendance_table(self, headers: List[str]) -> bool:
689
+ txt = " ".join(str(h) for h in headers).lower()
690
+ return "attendance" in txt
691
+
692
+ def _is_vehicle_summary_table(self, headers: List[str]) -> bool:
693
+ txt = " ".join(str(h) for h in headers).lower()
694
+ return any(t in txt for t in ["powered vehicles", "trailing vehicles", "drivers in bfm"])
695
+
696
+ def _is_vehicle_registration_table(self, headers: List[str]) -> bool:
697
+ if not headers: return False
698
+ ch = [_canon_header(h) for h in headers]
699
+ has_reg = any(
700
+ ("registration" in h) or re.search(r"\breg(?:istration)?\b", h) or ("reg" in h and "no" in h)
701
+ for h in ch
702
+ )
703
+ others = ["roadworthiness","maintenance records","daily checks","fault recording","fault repair",
704
+ "sub contractor","sub-contractor","weight verification","rfs suspension","suspension system maintenance",
705
+ "trip records","fault recording reporting on suspension system","fault reporting suspension"]
706
+ has_signal = any(any(tok in h for tok in others) for h in ch)
707
+ return has_reg and has_signal
708
+
709
+ def _is_driver_table(self, headers: List[str]) -> bool:
710
+ txt = " ".join(str(h) for h in headers).lower()
711
+ return any(t in txt for t in ["driver", "scheduler", "tlif", "medical"])
712
+
713
+ def _is_management_compliance_table(self, headers: List[str]) -> bool:
714
+ txt = " ".join(str(h) for h in headers).lower()
715
+ return any(t in txt for t in ["maintenance management", "mass management", "fatigue management"])
716
+
717
+ def _extract_vehicle_tables_from_text(self, text_pages: List[Dict], extracted: Dict):
718
+ # flatten text
719
+ lines = []
720
+ for p in text_pages or []:
721
+ for ln in re.split(r"\s*\n\s*", p.get("text", "")):
722
+ ln = _smart_space(ln)
723
+ if ln: lines.append(ln)
724
+
725
+ maint_rows, mass_rows = [], []
726
+ rf_pat = re.compile(r"\bRF\s*\d+\b", re.IGNORECASE)
727
+
728
+ for ln in lines:
729
+ # _smart_space splits letter/digit runs ("XT02GH" -> "XT 02 GH"), so test
+ # re-joined runs of up to three adjacent tokens, not just single tokens
+ tokens = ln.split()
+ grams = (" ".join(tokens[i:i + n]) for i in range(len(tokens)) for n in (3, 2, 1))
+ reg = next((g for g in grams if looks_like_plate(g)), None)
732
+ if not reg:
733
+ continue
734
+
735
+ # everything after the reg on that line
736
+ tail = _smart_space(ln.split(reg, 1)[1]) if reg in ln else ""
737
+ dates = extract_date_tokens(tail)
738
+ has_rf = bool(rf_pat.search(ln)) or "suspension" in ln.lower()
739
+
740
+ if has_rf:
741
+ rfs = (rf_pat.search(ln).group(0).upper().replace(" ", "") if rf_pat.search(ln) else "")
742
+ wv = dates[0] if len(dates) > 0 else ""
743
+ rest = dates[1:]
744
+ mass_rows.append({
745
+ "registration": reg,
746
+ "sub_contractor": "Yes" if " yes " in f" {ln.lower()} " else ("No" if " no " in f" {ln.lower()} " else ""),
747
+ "sub_comp": "Yes" if " yes " in f" {ln.lower()} " else ("No" if " no " in f" {ln.lower()} " else ""),
748
+ "weight_verification": wv,
749
+ "rfs_certification": rfs or ("N/A" if "n/a" in ln.lower() else ""),
750
+ "suspension_maintenance": rest[0] if len(rest) > 0 else "",
751
+ "trip_records": rest[1] if len(rest) > 1 else "",
752
+ "fault_reporting_suspension": rest[2] if len(rest) > 2 else "",
753
+ })
754
+ else:
755
+ # map first 5 date-like tokens in sensible order; fallbacks keep table consistent
756
+ rw = dates[0] if len(dates) > 0 else ""
757
+ mr = dates[1] if len(dates) > 1 else ""
758
+ dc = dates[2] if len(dates) > 2 else ""
759
+ fr = dates[3] if len(dates) > 3 else ""
760
+ rp = dates[4] if len(dates) > 4 else ""
761
+ maint_rows.append({
762
+ "registration": reg,
763
+ "roadworthiness": rw,
764
+ "maintenance_records": mr or dc,
765
+ "daily_checks": dc,
766
+ "fault_recording": fr or rp,
767
+ "fault_repair": rp or fr,
768
+ })
769
+
770
+ # After building maint_rows and mass_rows, merge them into the shared vehicles list
771
+ vlist = extracted.setdefault("vehicles", []) # ensure it always exists
772
+
773
+ if maint_rows or mass_rows:
774
+ for r in maint_rows:
775
+ r["section"] = "maintenance"
776
+ vlist.append(r)
777
+ for r in mass_rows:
778
+ r["section"] = "mass"
779
+ vlist.append(r)
780
+ self.log_debug(f"Vehicle rows (text fallback): maint={len(maint_rows)} mass={len(mass_rows)} total={len(vlist)}")
781
+ else:
782
+ self.log_debug("Vehicle rows (text fallback): none detected.")
783
+
784
+
785
+ # ───────────────────────────── simple extractors (spacing applied) ─────────────────────────────
786
+ def _extract_audit_info_table(self, data_rows: List[List], extracted: Dict):
787
+ ai = extracted.setdefault("audit_info", {})
788
+ for row in data_rows:
789
+ if len(row) < 2: continue
790
+ key = _canon(row[0])
791
+ val = _smart_space(" ".join(str(c).strip() for c in row[1:] if str(c).strip()))
792
+ if not val: continue
793
+ if "date" in key and "audit" in key: ai["date_of_audit"] = val
794
+ elif "location" in key: ai["location"] = val
795
+ elif "auditor" in key and "name" in key: ai["auditor_name"] = val
796
+ elif "matrix" in key: ai["matrix_id"] = val
797
+
798
+ def _extract_operator_info_table(self, data_rows: List[List], extracted: Dict):
799
+ oi = extracted.setdefault("operator_info", {})
800
+ for row in data_rows:
801
+ if len(row) < 2: continue
802
+ key = _canon(row[0])
803
+ val = _smart_space(" ".join(str(c).strip() for c in row[1:] if str(c).strip()))
804
+ if not val: continue
805
+ if "operator" in key and "name" in key and _looks_like_company(val): oi["name"] = val
806
+ elif "trading" in key: oi["trading_name"] = val
807
+ elif "australian" in key and "company" in key: oi["acn"] = val
808
+ elif "business" in key and "address" in key: oi["business_address"] = val
809
+ elif "postal" in key and "address" in key: oi["postal_address"] = val
810
+ elif "email" in key: oi["email"] = val
811
+ elif "telephone" in key or "phone" in key: oi["phone"] = val
812
+ elif "manual" in key or ("nhvas" in key and "manual" in key) or "developed" in key:
813
+ if _looks_like_manual_value(val):
814
+ oi["manual"] = val
815
+
816
+ def _extract_attendance_table(self, data_rows: List[List], extracted: Dict):
817
+ lst = []
818
+ for row in data_rows:
819
+ if not row: continue
820
+ cells = [str(c).strip() for c in row if str(c).strip()]
821
+ if not cells: continue
822
+ lst.append(_smart_space(" ".join(cells)))
823
+ if lst:
824
+ extracted["attendance"] = lst
825
+
826
+ def _extract_vehicle_summary_table(self, data_rows: List[List], extracted: Dict):
827
+ vs = extracted.setdefault("vehicle_summary", {})
828
+ for row in data_rows:
829
+ if len(row) < 2: continue
830
+ key = _canon(row[0])
831
+ value = ""
832
+ for c in row[1:]:
833
+ if str(c).strip():
834
+ value = _smart_space(str(c).strip()); break
835
+ if not value: continue
836
+ if "powered" in key and "vehicle" in key: vs["powered_vehicles"] = value
837
+ elif "trailing" in key and "vehicle" in key: vs["trailing_vehicles"] = value
838
+ elif "drivers" in key and "bfm" in key: vs["drivers_bfm"] = value
839
+ elif "drivers" in key and "afm" in key: vs["drivers_afm"] = value
840
+
841
+ # ▶▶ REPLACED: column mapping by headers
842
+ def _extract_vehicle_registration_table(self, headers, rows, extracted, page_num):
843
+ ch = [_canon_header(h) for h in (headers or [])]
844
+ alias = _map_header_indices(headers or [])
845
+
846
+ # header indices (may be misaligned vs data; that's OK, we’ll search near them)
847
+ def idx_of(*needles):
848
+ for i, h in enumerate(ch):
849
+ if all(n in h for n in needles): return i
850
+ return None
851
+
852
+ # plain `or`-chaining would skip a legitimate column index 0, so treat only None as "missing"
+ def first_idx(*cands):
+ return next((c for c in cands if c is not None), None)
+
+ reg_i = first_idx(alias.get("registration"), idx_of("registration number"), idx_of("registration"), idx_of("reg", "no"))
+ rw_i = first_idx(alias.get("roadworthiness"), idx_of("roadworthiness"))
+ maint_i = first_idx(alias.get("maintenance_records"), idx_of("maintenance", "records"))
+ daily_i = first_idx(alias.get("daily_checks"), idx_of("daily", "check"))
+ fr_i = first_idx(alias.get("fault_recording"), idx_of("fault", "recording"))
+ rep_i = first_idx(alias.get("fault_repair"), idx_of("fault", "repair"))
+
+ weight_i = first_idx(alias.get("weight_verification"), idx_of("weight", "verification"))
+ rfs_i = first_idx(alias.get("rfs_certification"), idx_of("rfs", "certification"))
+ susp_i = first_idx(alias.get("suspension_maintenance"), idx_of("suspension", "maintenance"))
+ trip_i = first_idx(alias.get("trip_records"), idx_of("trip", "records"))
+ frs_i = first_idx(alias.get("fault_reporting_suspension"), idx_of("fault", "reporting", "suspension"))
864
+
865
+ # classify table type by header signals
866
+ is_maint = any("roadworthiness" in h or "maintenance records" in h or ("daily" in h and "check" in h) or "fault repair" in h for h in ch)
867
+ is_mass = any("weight verification" in h or "rfs" in h or "suspension system" in h or "trip records" in h or "reporting on suspension" in h for h in ch)
868
+
869
+ maint_rows = extracted.setdefault("_maint_rows", []) if is_maint else None
870
+ added = 0
871
+
872
+ for r in rows or []:
873
+ # tolerant plate pick (handles misaligned columns)
874
+ reg = self._pick_nearby(r, reg_i, "plate", window=4)
875
+ if not reg or not looks_like_plate(reg):
876
+ continue
877
+
878
+ # collect values using tolerant picks
879
+ if is_maint:
880
+ rw = self._pick_nearby(r, rw_i, "date", window=4)
881
+ mr = self._pick_nearby(r, maint_i, "date", window=4)
882
+ dc = self._pick_nearby(r, daily_i, "date", window=4)
883
+ fr = self._pick_nearby(r, fr_i, "date", window=4)
884
+ rep = self._pick_nearby(r, rep_i, "date", window=4)
885
+
886
+ # sensible fallbacks
887
+ if not mr and dc: mr = dc
888
+ if not rep and fr: rep = fr
889
+ if not fr and rep: fr = rep
890
+
891
+ else: # mass or mixed
892
+ wv = self._pick_nearby(r, weight_i, "date", window=4)
893
+ rfs = self._pick_nearby(r, rfs_i, "rf", window=5)
894
+ sm = self._pick_nearby(r, susp_i, "date", window=4)
895
+ tr = self._pick_nearby(r, trip_i, "date", window=4)
896
+ frs = self._pick_nearby(r, frs_i, "date", window=4)
897
+ yn1 = self._pick_nearby(r, idx_of("sub","contractor"), "yn", window=3) or ""
898
+ yn2 = self._pick_nearby(r, idx_of("sub contracted vehicles statement of compliance"), "yn", window=3) or yn1
899
+
900
+ # merge into vehicle map
901
+ v = self._vehicle_by_reg.get(reg)
902
+ if v is None:
903
+ v = {"registration": reg}
904
+ self._vehicle_by_reg[reg] = v
905
+ added += 1
906
+
907
+ if is_maint:
908
+ v["seen_in_maintenance"] = True
909
+ if rw: v.setdefault("roadworthiness", rw)
910
+ if mr: v.setdefault("maintenance_records", mr)
911
+ if dc: v.setdefault("daily_checks", dc)
912
+ if fr: v.setdefault("fault_recording", fr)
913
+ if rep: v.setdefault("fault_repair", rep)
914
+
915
+ if maint_rows is not None:
916
+ maint_rows.append({
917
+ "registration": reg,
918
+ "roadworthiness": rw,
919
+ "maintenance_records": mr or dc,
920
+ "daily_checks": dc,
921
+ "fault_recording": fr or rep,
922
+ "fault_repair": rep or fr,
923
+ })
924
+ else:
925
+ v["seen_in_mass"] = True
926
+ if yn1: v.setdefault("sub_contractor", yn1)
927
+ if yn2: v.setdefault("sub_comp", yn2)
928
+ if wv: v.setdefault("weight_verification", wv)
929
+ if rfs: v.setdefault("rfs_certification", _smart_space(rfs).upper().replace(" ", ""))
930
+ if sm: v.setdefault("suspension_maintenance", sm)
931
+ if tr: v.setdefault("trip_records", tr)
932
+ if frs: v.setdefault("fault_reporting_suspension", frs)
933
+
934
+ extracted["vehicles"] = list(self._vehicle_by_reg.values())
935
+ return added
936
+
937
+ def _extract_driver_table(self, headers: List[str], data_rows: List[List], extracted: Dict):
938
+ """Header-driven extraction for Driver / Scheduler Records."""
939
+ drivers = []
940
+ ch = [_canon_header(h) for h in headers or []]
941
+
942
+ # helpers
943
+ def find_col(needles: list[str]) -> Optional[int]:
944
+ for i, h in enumerate(ch):
945
+ if any(n in h for n in needles):
946
+ return i
947
+ return None
948
+
949
+ def find_col_rx(patterns: list[str]) -> Optional[int]:
950
+ for i, h in enumerate(ch):
951
+ if any(re.search(p, h) for p in patterns):
952
+ return i
953
+ return None
954
+
955
+ name_idx = find_col_rx([r"\bdriver\s*/\s*scheduler\s*name\b",
956
+ r"\bdriver\s+name\b", r"\bscheduler\s+name\b", r"\bname\b"])
957
+ tlif_d_idx = find_col(["driver tlif"])
958
+ tlif_s_idx = find_col(["scheduler tlif"])
959
+ medical_idx= find_col(["medical", "expiry"])
960
+ roster_idx = find_col_rx([r"\broster\b", r"\bsafe\s+driving\s+plan\b", r"\bschedule\b(?!r\b)"])
961
+ fit_idx = find_col(["fit for duty"])
962
+ diary_idx = find_col(["work diary", "electronic work diary", "page numbers"])
963
+
964
+ for row in data_rows:
965
+ if not row:
966
+ continue
967
+
968
+ name = None
969
+ if name_idx is not None and name_idx < len(row):
970
+ name = _smart_space(str(row[name_idx]).strip())
971
+ if not name:
972
+ continue
973
+
974
+ d = {"name": name}
975
+
976
+ if tlif_d_idx is not None and tlif_d_idx < len(row):
977
+ d["driver_tlif"] = _smart_space(str(row[tlif_d_idx]).strip())
978
+ if tlif_s_idx is not None and tlif_s_idx < len(row):
979
+ d["scheduler_tlif"] = _smart_space(str(row[tlif_s_idx]).strip())
980
+ if medical_idx is not None and medical_idx < len(row):
981
+ d["medical_expiry"] = _smart_space(str(row[medical_idx]).strip())
982
+
983
+ # Roster/Schedule/SDP: prefer the detected column; accept only date/range-like, not the name
984
+ if roster_idx is not None and roster_idx < len(row):
985
+ raw_roster = _smart_space(str(row[roster_idx]).strip())
986
+ if raw_roster and re.search(r"[0-9/–-]", raw_roster) and raw_roster.lower() != name.lower():
987
+ d["roster_schedule"] = raw_roster
988
+
989
+ # Fallback: scan the row for the first date/range-like cell that's not the name cell
990
+ if "roster_schedule" not in d:
991
+ for j, cell in enumerate(row):
992
+ if j == name_idx:
993
+ continue
994
+ s = _smart_space(str(cell).strip())
995
+ if s and re.search(r"[0-9/–-]", s) and s.lower() != name.lower():
996
+ d["roster_schedule"] = s
997
+ break
998
+
999
+ if fit_idx is not None and fit_idx < len(row):
1000
+ d["fit_for_duty"] = _smart_space(str(row[fit_idx]).strip())
1001
+ if diary_idx is not None and diary_idx < len(row):
1002
+ d["work_diary"] = _smart_space(str(row[diary_idx]).strip())
1003
+
1004
+ drivers.append(d)
1005
+
1006
+ if drivers:
1007
+ extracted["drivers_detailed"] = drivers
1008
+ self.log_debug(f"Driver rows extracted (header-based): {len(drivers)}")
1009
+
1010
+
1011
+ def _extract_management_table(self, data_rows: List[List], extracted: Dict, headers: List[str]):
1012
+ txt = " ".join(str(h) for h in headers).lower()
1013
+ comp = {}
1014
+ for row in data_rows:
1015
+ if len(row) < 2: continue
1016
+ std = str(row[0]).strip()
1017
+ val = _smart_space(str(row[1]).strip())
1018
+ if std.startswith("Std") and val:
1019
+ comp[std] = val
1020
+ if comp:
1021
+ if "maintenance" in txt: extracted["maintenance_compliance"] = comp
1022
+ elif "mass" in txt: extracted["mass_compliance"] = comp
1023
+ elif "fatigue" in txt: extracted["fatigue_compliance"] = comp
1024
+
1025
+ def _extract_text_content(self, text_pages: List[Dict], extracted: Dict):
1026
+ all_text = " ".join(page.get("text", "") for page in text_pages)
1027
+ all_text = _smart_space(all_text)
1028
+
1029
+ # business summary
1030
+ patt = [
1031
+ r"Nature of the Operators? Business.*?:\s*(.*?)(?:Accreditation Number|Expiry Date|$)",
1032
+ r"Nature of.*?Business.*?Summary.*?:\s*(.*?)(?:Accreditation|$)"
1033
+ ]
1034
+ for p in patt:
1035
+ m = re.search(p, all_text, re.IGNORECASE | re.DOTALL)
1036
+ if m:
1037
+ txt = re.sub(r'\s+', ' ', m.group(1).strip())
1038
+ txt = re.sub(r'\s*(Accreditation Number.*|Expiry Date.*)', '', txt, flags=re.IGNORECASE)
1039
+ if len(txt) > 50:
1040
+ extracted["business_summary"] = txt
1041
+ break
1042
+
1043
+ # audit conducted date
1044
+ for p in [
1045
+ r"Audit was conducted on\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
1046
+ r"DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
1047
+ r"AUDITOR SIGNATURE\s+DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})"
1048
+ ]:
1049
+ m = re.search(p, all_text, re.IGNORECASE)
1050
+ if m:
1051
+ extracted["audit_conducted_date"] = _smart_space(m.group(1).strip())
1052
+ break
1053
+
1054
+ # print accreditation name
1055
+ for p in [
1056
+ r"\(print accreditation name\)\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)",
1057
+ r"print accreditation name.*?\n\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)"
1058
+ ]:
1059
+ m = re.search(p, all_text, re.IGNORECASE)
1060
+ if m:
1061
+ extracted["print_accreditation_name"] = _smart_space(m.group(1).strip())
1062
+ break
1063
+
1064
+ # numbers in text (optional)
1065
+ for p in [
1066
+ r"Number of powered vehicles\s+(\d+)",
1067
+ r"powered vehicles\s+(\d+)",
1068
+ r"Number of trailing vehicles\s+(\d+)",
1069
+ r"trailing vehicles\s+(\d+)",
1070
+ r"Number of drivers in BFM\s+(\d+)",
1071
+ r"drivers in BFM\s+(\d+)"
1072
+ ]:
1073
+ m = re.search(p, all_text, re.IGNORECASE)
1074
+ if m:
1075
+ val = m.group(1)
1076
+ if "powered" in p: extracted.setdefault("vehicle_summary", {})["powered_vehicles"] = val
1077
+ elif "trailing" in p: extracted.setdefault("vehicle_summary", {})["trailing_vehicles"] = val
1078
+ elif "bfm" in p.lower(): extracted.setdefault("vehicle_summary", {})["drivers_bfm"] = val
1079
+
1080
+ def _extract_detailed_management_data(self, extracted_data: Dict, extracted: Dict):
1081
+ all_tables = extracted_data.get("all_tables", [])
1082
+ for table in all_tables:
1083
+ headers = table.get("headers", [])
1084
+ data_rows = table.get("data", [])
1085
+ page_num = table.get("page", 0)
1086
+ if self._has_details_column(headers):
1087
+ section = self._identify_management_section(headers)
1088
+ if section:
1089
+ self._extract_management_details(data_rows, extracted, section)
1090
+ elif 6 <= page_num <= 15:
1091
+ self._extract_summary_by_content(data_rows, headers, extracted, page_num)
1092
+
1093
+ def _extract_summary_by_content(self, data_rows: List[List], headers: List[str], extracted: Dict, page_num: int):
1094
+ section_type = "maintenance" if 6 <= page_num <= 9 else "mass" if 10 <= page_num <= 12 else "fatigue" if 13 <= page_num <= 15 else None
1095
+ if not section_type: return
1096
+ details_key = f"{section_type}_summary_details"
1097
+ extracted[details_key] = {}
1098
+ for row in data_rows:
1099
+ if len(row) < 2: continue
1100
+ standard = str(row[0]).strip()
1101
+ details = _smart_space(str(row[1]).strip())
1102
+ if standard.startswith("Std") and details and len(details) > 10:
1103
+ m = re.search(r"Std\s+(\d+)\.\s*([^(]+)", standard)
1104
+ if m:
1105
+ key = f"Std {m.group(1)}. {m.group(2).strip()}"
1106
+ extracted[details_key][key] = details
1107
+
1108
+ def _has_details_column(self, headers: List[str]) -> bool:
1109
+ return "details" in " ".join(str(h) for h in headers).lower()
1110
+
1111
+ def _identify_management_section(self, headers: List[str]) -> Optional[str]:
1112
+ txt = " ".join(str(h) for h in headers).lower()
1113
+ if "maintenance" in txt: return "maintenance"
1114
+ if "mass" in txt: return "mass"
1115
+ if "fatigue" in txt: return "fatigue"
1116
+ return None
1117
+
1118
+ def _extract_management_details(self, data_rows: List[List], extracted: Dict, section: str):
1119
+ details_key = f"{section}_details"
1120
+ extracted[details_key] = {}
1121
+ for row in data_rows:
1122
+ if len(row) < 2: continue
1123
+ standard = str(row[0]).strip()
1124
+ details = _smart_space(str(row[1]).strip())
1125
+ if standard.startswith("Std") and details and details != "V" and len(details) > 10:
1126
+ m = re.search(r"Std\s+\d+\.\s*([^(]+)", standard)
1127
+ if m:
1128
+ extracted[details_key][m.group(1).strip()] = details
1129
+
1130
+ def _extract_vehicle_driver_data(self, extracted_data: Dict, extracted: Dict):
1131
+ vehicle_regs = extracted_data.get("vehicle_registrations", [])
1132
+ if vehicle_regs:
1133
+ extracted["vehicle_registrations"] = vehicle_regs
1134
+ driver_records = extracted_data.get("driver_records", [])
1135
+ if driver_records:
1136
+ extracted["driver_records"] = driver_records
1137
+
1138
+
1141
+ def map_vehicle_registration_arrays(self, pdf_extracted: Dict, merged: Dict):
1142
+ """Extract and map vehicle registration data (Maintenance + Mass) to DOCX arrays."""
1143
+ vehicles_src = []
1144
+
1145
+ # Prefer rows we parsed ourselves (header-based). Fall back to curated list if present.
1146
+ if "vehicles" in pdf_extracted and isinstance(pdf_extracted["vehicles"], list):
1147
+ vehicles_src = pdf_extracted["vehicles"]
1148
+ elif "vehicle_registrations" in pdf_extracted and isinstance(pdf_extracted["vehicle_registrations"], list):
1149
+ # Normalize curated structure (list of dicts with keys like 'registration_number', etc.)
1150
+ for row in pdf_extracted["vehicle_registrations"]:
1151
+ if not isinstance(row, dict):
1152
+ continue
1153
+ v = {
1154
+ "registration": _smart_space(row.get("registration_number") or row.get("registration") or ""),
1155
+ # Maintenance table columns (names as seen in curated JSON)
1156
+ "roadworthiness": _smart_space(row.get("roadworthiness_certificates", "")),
1157
+ "maintenance_records": _smart_space(row.get("maintenance_records", "")),
1158
+ "daily_checks": _smart_space(row.get("daily_checks", "")),
1159
+ "fault_recording": _smart_space(row.get("fault_recording_reporting", "")),
1160
+ "fault_repair": _smart_space(row.get("fault_repair", "")),
1161
+ # Mass table columns (in case the curated list ever includes them)
1162
+ "sub_contractor": _smart_space(row.get("sub_contractor", "")),
1163
+ "sub_comp": _smart_space(row.get("sub_contracted_vehicles_statement_of_compliance", "")),
1164
+ "weight_verification": _smart_space(row.get("weight_verification_records", "")),
1165
+ "rfs_certification": _smart_space(row.get("rfs_suspension_certification", row.get("rfs_suspension_certification_#", ""))),
1166
+ "suspension_maintenance": _smart_space(row.get("suspension_system_maintenance", "")),
1167
+ "trip_records": _smart_space(row.get("trip_records", "")),
1168
+ "fault_reporting_suspension": _smart_space(row.get("fault_recording_reporting_on_suspension_system", "")),
1169
+ }
1170
+ if v["registration"]:
1171
+ vehicles_src.append(v)
1172
+
1173
+ if not vehicles_src:
1174
+ return # nothing to map
1175
+
1176
+ # Build column arrays
1177
+ regs = []
1178
+ roadworthiness = []
1179
+ maint_records = []
1180
+ daily_checks = []
1181
+ fault_recording = []
1182
+ fault_repair = []
1183
+
1184
+ sub_contractors = []
1185
+ weight_verification = []
1186
+ rfs_certification = []
1187
+ suspension_maintenance = []
1188
+ trip_records = []
1189
+ fault_reporting_suspension = []
1190
+
1191
+ for v in vehicles_src:
1192
+ reg = _smart_space(v.get("registration", "")).strip()
1193
+ if not reg:
1194
+ continue
1195
+ regs.append(reg)
1196
+
1197
+ roadworthiness.append(_smart_space(v.get("roadworthiness", "")).strip())
1198
+ maint_records.append(_smart_space(v.get("maintenance_records", "")).strip())
1199
+ daily_checks.append(_smart_space(v.get("daily_checks", "")).strip())
1200
+ fault_recording.append(_smart_space(v.get("fault_recording", "")).strip())
1201
+ fault_repair.append(_smart_space(v.get("fault_repair", "")).strip())
1202
+
1203
+ sub_contractors.append(_smart_space(v.get("sub_contractor", "")).strip())
1204
+ weight_verification.append(_smart_space(v.get("weight_verification", "")).strip())
1205
+ rfs_certification.append(_smart_space(v.get("rfs_certification", "")).strip())
1206
+ suspension_maintenance.append(_smart_space(v.get("suspension_maintenance", "")).strip())
1207
+ trip_records.append(_smart_space(v.get("trip_records", "")).strip())
1208
+ fault_reporting_suspension.append(_smart_space(v.get("fault_reporting_suspension", "")).strip())
1209
+
1210
+ # Update Maintenance table arrays (if present in template)
1211
+ if "Vehicle Registration Numbers Maintenance" in merged and regs:
1212
+ m = merged["Vehicle Registration Numbers Maintenance"]
1213
+ m["Registration Number"] = regs
1214
+ m["Roadworthiness Certificates"] = roadworthiness
1215
+ m["Maintenance Records"] = maint_records
1216
+ m["Daily Checks"] = daily_checks
1217
+ m["Fault Recording/ Reporting"] = fault_recording
1218
+ m["Fault Repair"] = fault_repair
1219
+
1220
+ # Update Mass table arrays (if present in template)
1221
+ if "Vehicle Registration Numbers Mass" in merged and regs:
1222
+ ms = merged["Vehicle Registration Numbers Mass"]
1223
+ ms["Registration Number"] = regs
1224
+ ms["Sub contractor"] = sub_contractors
1225
+ ms["Weight Verification Records"] = weight_verification
1226
+ ms["RFS Suspension Certification #"] = rfs_certification
1227
+ ms["Suspension System Maintenance"] = suspension_maintenance
1228
+ ms["Trip Records"] = trip_records
1229
+ ms["Fault Recording/ Reporting on Suspension System"] = fault_reporting_suspension
1230
+
1231
+ self.log_debug(f"Updated vehicle registration arrays for {len(regs)} vehicles")
1232
+ # ───────────────────────────── map to DOCX (apply spacing + safe fallbacks) ─────────────────────────────
1233
+ def map_to_docx_structure(self, pdf_extracted: Dict, docx_data: Dict, pdf_data: Dict) -> Dict:
1234
+ merged = json.loads(json.dumps(docx_data))
1235
+
1236
+ # Audit Information
1237
+ if "audit_info" in pdf_extracted and "Audit Information" in merged:
1238
+ ai = pdf_extracted["audit_info"]
1239
+ if ai.get("date_of_audit"):
1240
+ merged["Audit Information"]["Date of Audit"] = [_smart_space(ai["date_of_audit"])]
1241
+ if ai.get("location"):
1242
+ merged["Audit Information"]["Location of audit"] = [_smart_space(ai["location"])]
1243
+ if ai.get("auditor_name"):
1244
+ merged["Audit Information"]["Auditor name"] = [_smart_space(ai["auditor_name"])]
1245
+ if ai.get("matrix_id"):
1246
+ merged["Audit Information"]["Audit Matrix Identifier (Name or Number)"] = [_smart_space(ai["matrix_id"])]
1247
+
1248
+ # Operator Information
1249
+ if "operator_info" in pdf_extracted and "Operator Information" in merged:
1250
+ op = pdf_extracted["operator_info"]
1251
+ if op.get("name") and _looks_like_company(op["name"]):
1252
+ merged["Operator Information"]["Operator name (Legal entity)"] = [_smart_space(op["name"])]
1253
+ if op.get("trading_name"):
1254
+ merged["Operator Information"]["Registered trading name/s"] = [_smart_space(op["trading_name"])]
1255
+ if op.get("acn"):
1256
+ merged["Operator Information"]["Australian Company Number"] = [_smart_space(op["acn"])]
1257
+ if op.get("manual"):
1258
+ merged["Operator Information"]["NHVAS Manual (Policies and Procedures) developed by"] = [_smart_space(op["manual"])]
1259
+
1260
+ # Contact details
1261
+ if "operator_info" in pdf_extracted and "Operator contact details" in merged:
1262
+ op = pdf_extracted["operator_info"]
1263
+ if op.get("business_address"):
1264
+ merged["Operator contact details"]["Operator business address"] = [_smart_space(op["business_address"])]
1265
+ if op.get("postal_address"):
1266
+ merged["Operator contact details"]["Operator Postal address"] = [_smart_space(op["postal_address"])]
1267
+ if op.get("email"):
1268
+ merged["Operator contact details"]["Email address"] = [op["email"]]
1269
+ if op.get("phone"):
1270
+ merged["Operator contact details"]["Operator Telephone Number"] = [_smart_space(op["phone"])]
1271
+
1272
+ # Attendance
1273
+ if "attendance" in pdf_extracted and "Attendance List (Names and Position Titles)" in merged:
1274
+ merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"] = _clean_list(pdf_extracted["attendance"])
1275
+
1276
+ # Business summary
1277
+ if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
1278
+ merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(pdf_extracted["business_summary"])]
1279
+
1280
+ # Vehicle summary
1281
+ if "vehicle_summary" in pdf_extracted:
1282
+ vs = pdf_extracted["vehicle_summary"]
1283
+ if "Accreditation Vehicle Summary" in merged:
1284
+ if vs.get("powered_vehicles"):
1285
+ merged["Accreditation Vehicle Summary"]["Number of powered vehicles"] = [vs["powered_vehicles"]]
1286
+ if vs.get("trailing_vehicles"):
1287
+ merged["Accreditation Vehicle Summary"]["Number of trailing vehicles"] = [vs["trailing_vehicles"]]
1288
+ if "Accreditation Driver Summary" in merged:
1289
+ if vs.get("drivers_bfm"):
1290
+ merged["Accreditation Driver Summary"]["Number of drivers in BFM"] = [vs["drivers_bfm"]]
1291
+ if vs.get("drivers_afm"):
1292
+ merged["Accreditation Driver Summary"]["Number of drivers in AFM"] = [vs["drivers_afm"]]
1293
+
1294
+ # Summary sections (unchanged behavior)
1295
+ summary_maps = self.build_summary_maps(pdf_data)
1296
+ for section_name, std_map in summary_maps.items():
1297
+ if section_name in merged and std_map:
1298
+ for detail_key, details_list in std_map.items():
1299
+ if detail_key in merged[section_name]:
1300
+ merged[section_name][detail_key] = details_list
1301
+ continue
1302
+ for docx_key in list(merged[section_name].keys()):
1303
+ m1 = re.search(r"Std\s+(\d+)", detail_key)
1304
+ m2 = re.search(r"Std\s+(\d+)", docx_key)
1305
+ if m1 and m2 and m1.group(1) == m2.group(1):
1306
+ merged[section_name][docx_key] = details_list
1307
+ break
1308
+
1309
+ # Vehicle registration arrays via consolidated builder
1310
+ sections = build_vehicle_sections(pdf_extracted)
1311
+ if "Vehicle Registration Numbers Maintenance" in merged:
1312
+ merged["Vehicle Registration Numbers Maintenance"].update(
1313
+ sections["Vehicle Registration Numbers Maintenance"]
1314
+ )
1315
+ if "Vehicle Registration Numbers Mass" in merged:
1316
+ merged["Vehicle Registration Numbers Mass"].update(
1317
+ sections["Vehicle Registration Numbers Mass"]
1318
+ )
1319
+
1320
+
1321
+ # Driver / Scheduler Records Examined: copy the parsed per-driver columns into the DOCX arrays
1322
+ if "drivers_detailed" in pdf_extracted and "Driver / Scheduler Records Examined" in merged:
1323
+ drivers = pdf_extracted["drivers_detailed"]
1324
+
1325
+ def _looks_like_range(s):
1326
+ return bool(re.search(r"[0-9]{1,2}[/-]", s or ""))
1327
+
1328
+ merged["Driver / Scheduler Records Examined"]["Roster / Schedule / Safe Driving Plan (Date Range)"] = [d.get("roster_schedule","") for d in drivers]
1329
+ merged["Driver / Scheduler Records Examined"]["Fit for Duty Statement Completed (Yes/No)"] = [d.get("fit_for_duty","") for d in drivers]
1330
+ merged["Driver / Scheduler Records Examined"]["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"] = [d.get("work_diary","") for d in drivers]
1331
+
1332
+
1333
+ # --- Print accreditation name (robust, no UnboundLocalError) ---
1334
+ if "Print accreditation name" in merged:
1335
+ acc_name = "" # init
1336
+ acc_name = _smart_space(pdf_extracted.get("print_accreditation_name") or "")
1337
+ if not acc_name:
1338
+ oi = pdf_extracted.get("operator_info") or {}
1339
+ acc_name = _smart_space(oi.get("name") or "") or _smart_space(oi.get("trading_name") or "")
1340
+ if acc_name:
1341
+ merged["Print accreditation name"]["(print accreditation name)"] = [acc_name]
1342
+
1343
+ # Audit Declaration dates: prefer explicit extracted date; fallback to audit_info; ignore literal "Date"
1344
+ if "Audit Declaration dates" in merged:
1345
+ def _real_date(s: Optional[str]) -> bool:
1346
+ return bool(s and re.search(r"\d", s) and not re.fullmatch(r"date", s.strip(), re.I))
1347
+
1348
+ val = pdf_extracted.get("audit_conducted_date")
1349
+ if not _real_date(val):
1350
+ val = (pdf_extracted.get("audit_info", {}) or {}).get("date_of_audit")
1351
+
1352
+ if _real_date(val):
1353
+ merged["Audit Declaration dates"]["Audit was conducted on"] = [_smart_space(val)]
1354
+
1355
+
1356
+ # Operator Declaration: page 22 image missing → derive from first Attendance "Name - Title"
1357
+ if "Operator Declaration" in merged:
1358
+ # If an explicit operator declaration exists, use it
1359
+ if "operator_declaration" in pdf_extracted:
1360
+ od = pdf_extracted["operator_declaration"]
1361
+ pn = _smart_space(od.get("print_name", ""))
1362
+ pt = _smart_space(od.get("position_title", ""))
1363
+ if pn:
1364
+ merged["Operator Declaration"]["Print Name"] = [pn]
1365
+ if pt:
1366
+ merged["Operator Declaration"]["Position Title"] = [pt]
1367
+ else:
1368
+ # Fallback: first "Name - Title" from Attendance
1369
+ nt = self._first_attendance_name_title(pdf_extracted.get("attendance", []))
1370
+ if nt:
1371
+ merged["Operator Declaration"]["Print Name"] = [nt[0]]
1372
+ merged["Operator Declaration"]["Position Title"] = [nt[1]]
1373
+
1374
+
1375
+ # Paragraphs: fill company name for the 3 management headings; set the 2 dates
1376
+ if "paragraphs" in merged:
1377
+ paras = merged["paragraphs"]
1378
+
1379
+ audit_date = (
1380
+ pdf_extracted.get("audit_conducted_date")
1381
+ or pdf_extracted.get("audit_info", {}).get("date_of_audit")
1382
+ )
1383
+
1384
+ # Prefer accreditation name, else operator legal name, else trading name
1385
+ company_name = (
1386
+ _smart_space(pdf_extracted.get("print_accreditation_name") or "")
1387
+ or _smart_space(pdf_extracted.get("operator_info", {}).get("name") or "")
1388
+ or _smart_space(pdf_extracted.get("operator_info", {}).get("trading_name") or "")
1389
+ )
1390
+
1391
+ # Update the three layered headings
1392
+ for key in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
1393
+ if key in paras and company_name:
1394
+ paras[key] = [company_name]
1395
+
1396
+ # Second-last page: date under page heading
1397
+ if "NHVAS APPROVED AUDITOR DECLARATION" in paras and audit_date:
1398
+ paras["NHVAS APPROVED AUDITOR DECLARATION"] = [_smart_space(audit_date)]
1399
+
1400
+ # Last page: date under long acknowledgement paragraph
1401
+ ack_key = ("I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
1402
+ "I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.")
1403
+ if ack_key in paras and audit_date:
1404
+ paras[ack_key] = [_smart_space(audit_date)]
1405
+
1406
+ self._force_fill_maintenance_from_tables(pdf_data, merged)
1407
+ return merged
1408
+
1409
+ # ───────────────────────────── merge & CLI (unchanged) ─────────────────────────────
1410
+ def merge_pdf_to_docx(self, docx_data: Dict, pdf_data: Dict) -> Dict:
1411
+ self.log_debug("Starting comprehensive PDF extraction...")
1412
+ pdf_extracted = self.extract_from_pdf_comprehensive(pdf_data)
1413
+ self.log_debug(f"Extracted PDF data keys: {list(pdf_extracted.keys())}")
1414
+
1415
+ self.log_debug("Mapping to DOCX structure...")
1416
+ merged_data = self.map_to_docx_structure(pdf_extracted, docx_data, pdf_data)
1417
+
1418
+ for section_name, section_data in docx_data.items():
1419
+ if isinstance(section_data, dict):
1420
+ for label in section_data:
1421
+ if (section_name in merged_data and
1422
+ label in merged_data[section_name] and
1423
+ merged_data[section_name][label] != docx_data[section_name][label]):
1424
+ print(f"✓ Updated {section_name}.{label}: {merged_data[section_name][label]}")
1425
+ return merged_data
1426
+
1427
+ def process_files(self, docx_file: str, pdf_file: str, output_file: str):
1428
+ try:
1429
+ print(f"Loading DOCX JSON from: {docx_file}")
1430
+ with open(docx_file, 'r', encoding='utf-8') as f:
1431
+ docx_data = json.load(f)
1432
+ print(f"Loading PDF JSON from: {pdf_file}")
1433
+ with open(pdf_file, 'r', encoding='utf-8') as f:
1434
+ pdf_data = json.load(f)
1435
+
1436
+ print("Merging PDF data into DOCX structure...")
1437
+ merged_data = self.merge_pdf_to_docx(docx_data, pdf_data)
1438
+
1439
+ print(f"Saving merged data to: {output_file}")
1440
+ with open(output_file, 'w', encoding='utf-8') as f:
1441
+ json.dump(merged_data, f, indent=2, ensure_ascii=False)
1442
+
1443
+ print("✅ Merge completed successfully!")
1444
+ return merged_data
1445
+ except Exception as e:
1446
+ print(f"❌ Error processing files: {str(e)}")
1447
+ import traceback
1448
+ traceback.print_exc()
1449
+ raise
1450
+
1451
+ def main():
1452
+ if len(sys.argv) != 4:
1453
+ print("Usage: python nhvas_merger.py <docx_json_file> <pdf_json_file> <output_file>")
1454
+ print("Example: python nhvas_merger.py docx_template.json pdf_extracted.json merged_output.json")
1455
+ sys.exit(1)
1456
+
1457
+ docx_file = sys.argv[1]
1458
+ pdf_file = sys.argv[2]
1459
+ output_file = sys.argv[3]
1460
+
1461
+ for file_path in [docx_file, pdf_file]:
1462
+ if not Path(file_path).exists():
1463
+ print(f"❌ File not found: {file_path}")
1464
+ sys.exit(1)
1465
+
1466
+ merger = NHVASMerger()
1467
+ merger.process_files(docx_file, pdf_file, output_file)
1468
+
1469
+ if __name__ == "__main__":
1470
+ main()
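
For reference, a minimal sketch of driving the merge step above from Python instead of the CLI; the JSON file names are hypothetical examples and it assumes the module is importable as update_docx_with_pdf:

    from update_docx_with_pdf import NHVASMerger

    merger = NHVASMerger()
    merged = merger.process_files(
        "docx_template.json",    # red-placeholder structure extracted from the DOCX
        "pdf_extracted.json",    # data pulled from the audit PDF
        "merged_output.json",    # result later consumed by updated_word.py
    )
    print(sorted(merged.keys()))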
space-pdf/updated_word.py ADDED
@@ -0,0 +1,1189 @@
1
+ #!/usr/bin/env python3
2
+ # update_docx_from_json.py
3
+ import sys, json, re
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple, Optional
6
+ from docx import Document
7
+ from docx.shared import RGBColor, Pt # add Pt
8
+ from docx.table import _Cell, Table
9
+ from docx.text.paragraph import Paragraph
10
+ from copy import deepcopy
11
+ from docx.oxml.ns import qn
12
+ from docx.oxml.table import CT_Tbl
13
+ from docx.oxml.text.paragraph import CT_P
14
+
15
+ BLACK = RGBColor(0, 0, 0)
16
+ RED = RGBColor(0xFF, 0x00, 0x00)
17
+
18
+ # ----------------------------- text helpers -----------------------------
19
+ def _find_table_with_headers(doc: Document, must_have: list[str]) -> Optional[Table]:
20
+ for t in doc.tables:
21
+ if not t.rows:
22
+ continue
23
+ head = canon(" ".join(cell_text(c) for c in t.rows[0].cells))
24
+ if all(canon_label(x) in head for x in must_have):
25
+ return t
26
+ return None
27
+
28
+ def ensure_auditor_decl_headers(doc: Document) -> bool:
29
+ """
30
+ Second-last page table under 'NHVAS APPROVED AUDITOR DECLARATION'.
31
+ Force the HEADER row to read exactly:
32
+ [ Print Name | NHVR or Exemplar Global Auditor Registration Number ]
33
+ Never touch the bottom (values) row.
34
+ """
35
+ changed = False
36
+ expected_left = "Print Name"
37
+ expected_right = "NHVR or Exemplar Global Auditor Registration Number"
38
+
39
+ for t in doc.tables:
40
+ if not t.rows or not t.rows[0].cells:
41
+ continue
42
+ # must look like the auditor table: header left says "Print Name", 2+ cols, 2+ rows
43
+ head_left = canon_label(cell_text(t.rows[0].cells[0]))
44
+ if head_left == "print name" and len(t.rows[0].cells) >= 2 and len(t.rows) >= 2:
45
+ # fix left header if needed
46
+ if canon_label(cell_text(t.rows[0].cells[0])) != canon_label(expected_left) or \
47
+ any(is_red_run(r) for p in t.rows[0].cells[0].paragraphs for r in p.runs):
48
+ _set_cell_text_black(t.rows[0].cells[0], expected_left)
49
+ changed = True
50
+ # reset the RIGHT header text when it has been overwritten or is still red (this is where "Peter Sheppard" was sitting)
51
+ if canon_label(cell_text(t.rows[0].cells[1])) != canon_label(expected_right) or \
52
+ any(is_red_run(r) for p in t.rows[0].cells[1].paragraphs for r in p.runs):
53
+ _set_cell_text_black(t.rows[0].cells[1], expected_right)
54
+ changed = True
55
+ # found and fixed the table; no need to continue
56
+ break
57
+
58
+ return changed
59
+
60
+
61
+ def fill_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
62
+ """Last page table: write values ONLY into the bottom row (red placeholders)."""
63
+ t = _find_table_with_headers(doc, ["Print Name", "Position Title"])
64
+ if not t or len(t.rows) < 2 or len(t.rows[0].cells) < 2:
65
+ return False
66
+ bot_left = t.rows[1].cells[0]
67
+ bot_right = t.rows[1].cells[1]
68
+
69
+ # only replace if that cell has a red placeholder
70
+ if any(is_red_run(r) for p in bot_left.paragraphs for r in p.runs):
71
+ _set_cell_text_black(bot_left, print_name)
72
+ if any(is_red_run(r) for p in bot_right.paragraphs for r in p.runs):
73
+ _set_cell_text_black(bot_right, position_title)
74
+ return True
75
+
76
+ def find_heading_index_from_end(doc: Document, heading: str) -> Optional[int]:
77
+ key = canon(heading)
78
+ allp = iter_paragraphs(doc)
79
+ for i in range(len(allp) - 1, -1, -1):
80
+ if key in canon(para_text(allp[i])):
81
+ return i
82
+ return None
83
+
84
+ def set_date_by_heading_from_end(doc: Document, heading: str, date_text: str, max_scan: int = 60) -> bool:
85
+ """Find the LAST occurrence of `heading`, then replace the FIRST red run in the next paragraphs."""
86
+ if not date_text:
87
+ return False
88
+ allp = iter_paragraphs(doc)
89
+ idx = find_heading_index_from_end(doc, heading)
90
+ if idx is None:
91
+ return False
92
+ for p in allp[idx + 1 : min(idx + 1 + max_scan, len(allp))]:
93
+ if replace_red_in_paragraph(p, date_text): # writes in black
94
+ return True
95
+ return False
96
+
97
+ def set_date_by_paragraph_from_end(doc: Document, paragraph_text: str, date_text: str, max_scan: int = 60) -> bool:
98
+ """Find the LAST paragraph matching `paragraph_text`, then set the FIRST red run after it."""
99
+ if not date_text:
100
+ return False
101
+ key = canon(paragraph_text)
102
+ allp = iter_paragraphs(doc)
103
+ hit = None
104
+ for i in range(len(allp) - 1, -1, -1):
105
+ if key in canon(para_text(allp[i])):
106
+ hit = i
107
+ break
108
+ if hit is None:
109
+ return False
110
+ # date placeholder is on the LAST page, right after this long paragraph
111
+ for p in allp[hit + 1 : min(hit + 1 + max_scan, len(allp))]:
112
+ if replace_red_in_paragraph(p, date_text): # writes in black
113
+ return True
114
+ return False
115
+
116
+ def set_layer3_name_after_management_heading(doc: Document, mid_heading: str, allowed_prev_titles: List[str], name: str) -> bool:
117
+ if not name:
118
+ return False
119
+
120
+ allp = iter_paragraphs(doc)
121
+ wrote = False
122
+ mid = canon(mid_heading)
123
+ allowed_prev = {canon(t) for t in allowed_prev_titles}
124
+
125
+ for i, p in enumerate(allp):
126
+ if canon(para_text(p)) != mid:
127
+ continue
128
+
129
+ # previous non-empty must be one of the allowed titles
130
+ j = i - 1
131
+ while j >= 0 and not nz(para_text(allp[j])):
132
+ j -= 1
133
+ if j < 0 or canon(para_text(allp[j])) not in allowed_prev:
134
+ continue
135
+
136
+ # next non-empty is the 3rd line we overwrite
137
+ k = i + 1
138
+ while k < len(allp) and not nz(para_text(allp[k])):
139
+ k += 1
140
+ if k >= len(allp):
141
+ continue
142
+
143
+ # compute target size from the middle heading; fall back to a sensible bump
144
+ target_size = _para_effective_font_size(allp[i]) or Pt(16)
145
+
146
+ _clear_para_and_write_black(allp[k], name)
147
+
148
+ # apply size to all runs explicitly (overrides style)
149
+ for r in allp[k].runs:
150
+ r.font.size = target_size
151
+
152
+ wrote = True
153
+
154
+ return wrote
155
+
156
+ def _para_effective_font_size(p: Paragraph):
157
+ # try explicit run sizes first
158
+ for r in p.runs:
159
+ if r.font.size:
160
+ return r.font.size
161
+ # then the paragraph style
162
+ if p.style and p.style.font and p.style.font.size:
163
+ return p.style.font.size
164
+ return None
165
+
166
+ # --- helpers for summary tables & summary overwrite ---
168
+ def _std_key(s: str) -> str:
169
+ """
170
+ Normalize a label to match a 'Std N' key.
171
+ e.g. 'Std 7. Internal Review' -> 'std 7'
172
+ """
173
+ t = canon_label(s)
174
+ m = re.match(r"(std\s+\d+)", t)
175
+ return m.group(1) if m else t
176
+
177
+ def _looks_like_summary_table(table: Table) -> Optional[Tuple[int, int]]:
178
+ """
179
+ Return (label_col_idx, details_col_idx) if this is a Summary table
180
+ with a DETAILS column; otherwise None.
181
+ """
182
+ if not table.rows:
183
+ return None
184
+ first = table.rows[0]
185
+ cols = len(first.cells)
186
+ if cols < 2:
187
+ return None
188
+
189
+ # header texts for first row
190
+ head = [canon(cell_text(c)) for c in first.cells]
191
+
192
+ # find DETAILS column
193
+ details_col = None
194
+ for j, t in enumerate(head):
195
+ if "detail" in t:
196
+ details_col = j
197
+ break
198
+ if details_col is None:
199
+ return None
200
+
201
+ # find the label column (left-hand standards column)
202
+ label_col = None
203
+ for j, t in enumerate(head):
204
+ if any(k in t for k in ["maintenance management", "mass management", "fatigue management"]):
205
+ label_col = j
206
+ break
207
+ if label_col is None:
208
+ # fallback: assume the first non-DETAILS column is the label column
209
+ label_col = 0 if details_col != 0 else 1
210
+
211
+ return (label_col, details_col)
219
+ def _header_col_texts(table: Table, scan_rows: int = 5) -> List[str]:
220
+ scan_rows = min(scan_rows, len(table.rows))
221
+ if scan_rows == 0:
222
+ return []
223
+ # pick the row with the most cells as base
224
+ base_row = max(range(scan_rows), key=lambda i: len(table.rows[i].cells))
225
+ base_cols = len(table.rows[base_row].cells)
226
+ cols = []
227
+ for j in range(base_cols):
228
+ parts = []
229
+ for i in range(scan_rows):
230
+ row = table.rows[i]
231
+ if j < len(row.cells):
232
+ parts.append(cell_text(row.cells[j]))
233
+ cols.append(canon(" ".join(parts)))
234
+ return cols
235
+
236
+ def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
237
+ """Header ends right before the first row whose 1st cell looks like '1.'"""
238
+ limit = min(scan_up_to, len(table.rows))
239
+ for i in range(limit):
240
+ first = cell_text(table.rows[i].cells[0]).strip()
241
+ if re.match(r"^\d+\.?$", first):
242
+ return i
243
+ # fallback to 1 header row
244
+ return 1
245
+
246
+ def map_cols_mass_strict(table: Table) -> Dict[str, int]:
247
+ cols = _header_col_texts(table, 5)
248
+ def first_col(*needles):
249
+ for j, t in enumerate(cols):
250
+ if all(n in t for n in needles):
251
+ return j
252
+ return None
253
+ idx = {
254
+ "no": first_col("no"),
255
+ "reg": first_col("registration", "number") or first_col("registration"),
256
+ "wv": first_col("weight", "verification"),
257
+ "rfs": first_col("rfs", "cert") or first_col("rfs", "certification"),
258
+ "susp": first_col("suspension", "maintenance"),
259
+ "trip": first_col("trip", "record"),
260
+ "frs": first_col("fault", "suspension") or first_col("fault", "reporting", "suspension"),
261
+ }
262
+ return {k: v for k, v in idx.items() if v is not None}
263
+
264
+ def find_mass_vehicle_numbers_table(doc: Document) -> Optional[Table]:
265
+ """Pick the Mass vehicle-number table by matching its column set (not the Summary table)."""
266
+ best = None
267
+ best_score = -1
268
+ for t in iter_tables(doc):
269
+ cols = _header_col_texts(t, 5)
270
+ allhdr = " ".join(cols)
271
+ # must look like the vehicle numbers table
272
+ hits = 0
273
+ hits += int(any("registration" in c and "number" in c for c in cols))
274
+ hits += int(any("weight" in c and "verification" in c for c in cols))
275
+ hits += int(any("rfs" in c and ("cert" in c or "certification" in c) for c in cols))
276
+ hits += int(any("suspension" in c and "maintenance" in c for c in cols))
277
+ hits += int(any("trip" in c and "record" in c for c in cols))
278
+ hits += int(any("fault" in c and "suspension" in c for c in cols))
279
+ # reject obvious Summary tables
280
+ if "details" in allhdr:
281
+ continue
282
+ # prefer tables with numbering column and many rows
283
+ score = hits + (0.5 if any("no" == c or c.startswith("no ") for c in cols) else 0) + (len(t.rows) / 100.0)
284
+ if hits >= 4 and score > best_score:
285
+ best, best_score = t, score
286
+ return best
287
+
288
+ def update_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
289
+ """
290
+ First try strict table label mapping for 'Print Name' and 'Position Title'.
291
+ If not found, fallback to the first two red placeholders under the 'Operator Declaration' heading.
292
+ """
293
+ changed = False
294
+ # 1) Table label approach
295
+ for lbl, val in (("Print Name", print_name), ("Position Title", position_title)):
296
+ if not val:
297
+ continue
298
+ loc = find_label_cell(doc, lbl)
299
+ if not loc:
300
+ # tolerate odd spacing/colon/camelcase
301
+ for alt in ("PrintName", "Print Name", "Print Name:", "PositionTitle", "Position Title", "Position Title:"):
302
+ loc = find_label_cell(doc, alt)
303
+ if loc:
304
+ break
305
+ if loc:
306
+ t, r, c = loc
307
+ cell = get_adjacent_value_cell(t, r, c)
308
+ if not replace_red_in_cell(cell, val):
309
+ _set_cell_text_black(cell, val)
310
+ changed = True
311
+
312
+ if changed:
313
+ return True
314
+
315
+ # 2) Fallback: heading-scoped red placeholders
316
+ head = "OPERATOR DECLARATION"
317
+ p = find_heading_paragraph(doc, head) or find_heading_paragraph(doc, head.title())
318
+ if not p:
319
+ return False
320
+ allp = iter_paragraphs(doc)
321
+ try:
322
+ i = allp.index(p)
323
+ except ValueError:
324
+ i = 0
325
+ red_targets = []
326
+ for q in allp[i+1:i+1+20]:
327
+ reds = [r for r in q.runs if is_red_run(r)]
328
+ if reds:
329
+ red_targets.extend(reds)
330
+ if len(red_targets) >= 2:
331
+ break
332
+ wrote = False
333
+ if print_name and red_targets:
334
+ _set_text_and_black(red_targets[0], print_name); wrote = True
335
+ if position_title and len(red_targets) >= 2:
336
+ _set_text_and_black(red_targets[1], position_title); wrote = True
337
+ return wrote
338
+
339
+
340
+ def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, List[str]]):
341
+ colmap = map_cols_mass_strict(table)
342
+ if "reg" not in colmap:
343
+ return
344
+ hdr_rows = count_header_rows(table, 6)
345
+ regs = arrays.get("Registration Number", [])
346
+ n = len(regs)
347
+
348
+ # clear data rows only
349
+ while len(table.rows) > hdr_rows:
350
+ table._tbl.remove(table.rows[-1]._tr)
351
+ # ensure enough rows
352
+ while len(table.rows) < hdr_rows + n:
353
+ table.add_row()
354
+
355
+ def put(row, key, arr_key, i):
356
+ if key in colmap:
357
+ vals = arrays.get(arr_key, [])
358
+ val = nz(vals[i]) if i < len(vals) else ""
359
+ replace_red_in_cell(row.cells[colmap[key]], val)
360
+
361
+ for i in range(n):
362
+ row = table.rows[hdr_rows + i]
363
+ replace_red_in_cell(row.cells[colmap["reg"]], nz(regs[i]))
364
+ put(row, "wv", "Weight Verification Records", i)
365
+ put(row, "rfs", "RFS Suspension Certification #", i)
366
+ put(row, "susp", "Suspension System Maintenance", i)
367
+ put(row, "trip", "Trip Records", i)
368
+ put(row, "frs", "Fault Recording/ Reporting on Suspension System", i)
369
+
370
+ def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
371
+ """For a Summary table (Maintenance/Mass/Fatigue), replace the entire DETAILS cell
372
+ for each Std N row with the JSON text (written in black)."""
373
+ # build desired texts
374
+ desired: Dict[str, str] = { _std_key(k): join_value(v) for k, v in section_dict.items() }
375
+
376
+ # pick which tables belong to this section by header sniff
377
+ wanted_prefix = canon_label(section_name.split()[0]) # "maintenance" | "mass" | "fatigue"
378
+
379
+ updated = 0
380
+ for t in doc.tables:
381
+ cols = _looks_like_summary_table(t)
382
+ if not cols:
383
+ continue
384
+ label_col, details_col = cols
385
+
386
+ head_txt = table_header_text(t, up_to_rows=2)
387
+ if wanted_prefix not in head_txt: # keep to the correct section
388
+ continue
389
+
390
+ # walk body rows
391
+ for i in range(1, len(t.rows)):
392
+ row = t.rows[i]
393
+ key = _std_key(cell_text(row.cells[label_col]))
394
+
395
+ # exact match or "std N" prefix match
396
+ cand = desired.get(key)
397
+ if not cand:
398
+ m = re.match(r"(std\s+\d+)", key)
399
+ if m:
400
+ for k2, v2 in desired.items():
401
+ if k2.startswith(m.group(1)):
402
+ cand = v2
403
+ break
404
+ if not cand:
405
+ continue
406
+
407
+ _set_cell_text_black(row.cells[details_col], cand) # full overwrite, black
408
+ updated += 1
409
+ return updated
410
+
411
+ SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
412
+ ORDINAL_DATE_PAT = re.compile(r"\b(\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4})\b", re.I)
413
+
414
+ def split_sentences_keep(text: str) -> List[str]:
415
+ s = " ".join(str(text or "").split())
416
+ if not s:
417
+ return []
418
+ out = []
419
+ start = 0
420
+ for m in SPLIT_SENT_PAT.finditer(s):
421
+ out.append(s[start:m.start()].strip())
422
+ start = m.end()
423
+ last = s[start:].strip()
424
+ if last:
425
+ out.append(last)
426
+ return out
427
+
428
+ _sent_split = re.compile(r'(?<=[.!?])\s+|\n+')
429
+ _date_pat = re.compile(r'\b(?:\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4}|\d{1,2}/\d{1,2}/\d{2,4}|[A-Za-z]+\s+\d{1,2},\s*\d{4})\b')
430
+
431
+ def extract_summary_snippets(desired_text: str):
432
+ sents = split_sentences_keep(desired_text)
433
+ dates = [m.group(0) for m in _date_pat.finditer(desired_text)]
434
+ pick = lambda rx: next((s for s in sents if re.search(rx, s, re.I)), None)
435
+ return {
436
+ "sheet_sent": pick(r'\b(daily\s+check|sheet)\b'),
437
+ "sheet_phrase": _extract_sheet_phrase_from_desired(desired_text),
438
+ "review": pick(r'\binternal\s+review\b'),
439
+ "qcs": pick(r'\bquarterly\b.*\bcompliance\b') or pick(r'\bquarterly\b'),
440
+ "dates": dates,
441
+ "sents": sents,
442
+ }
443
+
444
+ def fill_management_summary_tables(doc: Document, section_key: str, section_data: Dict[str, List[str]]):
445
+ """
446
+ Fill ALL summary tables for the given section_key ('maintenance'|'mass'|'fatigue')
447
+ by matching each row label (left column) against keys in section_data and
448
+ patching only the red text inside the DETAILS cell.
449
+ """
450
+ targets = [x for x in find_all_summary_tables(doc) if x[0] == section_key]
451
+ if not targets:
452
+ return
453
+
454
+ # build list of (normalized label, original label, desired_text)
455
+ desired = []
456
+ for label, vals in section_data.items():
457
+ want = canon_label(label)
458
+ if not want:
459
+ continue
460
+ desired.append((want, label, join_value(vals)))
461
+
462
+ for _, table, lcol, dcol in targets:
463
+ # iterate data rows (skip header)
464
+ for i in range(1, len(table.rows)):
465
+ left_txt_norm = canon_label(cell_text(table.rows[i].cells[lcol]))
466
+ if not left_txt_norm:
467
+ continue
468
+ for want_norm, _orig_lbl, value in desired:
469
+ # loose contains match handles minor punctuation differences
470
+ if want_norm and want_norm in left_txt_norm:
471
+ patch_details_cell_from_json(table.rows[i].cells[dcol], value)
472
+
473
+ def _set_text_and_black(run, new_text: str):
474
+ """Replace a run's text and force color to black (clears theme color too)."""
475
+ if new_text is None:
476
+ new_text = ""
477
+ run.text = str(new_text)
478
+ run.font.color.rgb = BLACK
479
+ try:
480
+ # clear any theme color so rgb sticks
481
+ run.font.color.theme_color = None
482
+ except Exception:
483
+ pass
484
+
485
+ def update_business_summary_once(doc: Document, value) -> bool:
486
+ """Replace only the red summary paragraph; keep 'Accreditation Number' and 'Expiry Date' lines."""
487
+ loc = (find_label_cell(doc, "Nature of the Operators Business (Summary)")
488
+ or find_label_cell(doc, "Nature of the Operators Business (Summary):"))
489
+ if not loc:
490
+ return False
491
+
492
+ t, r, c = loc
493
+ cell = get_adjacent_value_cell(t, r, c)
494
+ if not cell.paragraphs:
495
+ cell.add_paragraph("")
496
+
497
+ txt = join_value(value)
498
+
499
+ # find paragraphs with any red runs (the placeholders for the summary)
500
+ red_paras = [p for p in cell.paragraphs if any(is_red_run(run) for run in p.runs)]
501
+
502
+ if red_paras:
503
+ # write the summary into the first red paragraph (in black)
504
+ _clear_para_and_write_black(red_paras[0], txt)
505
+ # clear any extra red placeholders
506
+ for p in red_paras[1:]:
507
+ _clear_para_and_write_black(p, "")
508
+ else:
509
+ # no red placeholder found: just put the summary into the first paragraph, leave others
510
+ _clear_para_and_write_black(cell.paragraphs[0], txt)
511
+
512
+ return True
513
+
514
+
515
+ def _nuke_cell_paragraphs(cell: _Cell):
516
+ """Remove ALL paragraphs from a cell (true delete, not just emptying runs)."""
517
+ for p in list(cell.paragraphs):
518
+ p._element.getparent().remove(p._element)
519
+
520
+ def _clear_para_and_write_black(paragraph, text: str):
521
+ """Clear a whole paragraph and write fresh black text."""
522
+ # wipe existing runs
523
+ for r in list(paragraph.runs):
524
+ r.text = ""
525
+ r = paragraph.add_run(str(text or ""))
526
+ r.font.color.rgb = BLACK
527
+ try:
528
+ r.font.color.theme_color = None
529
+ except Exception:
530
+ pass
531
+
532
+ def _set_cell_text_black(cell, text: str):
533
+ """Clear a table cell and insert black text."""
534
+ # remove text from all runs in all paragraphs
535
+ for p in cell.paragraphs:
536
+ for r in p.runs:
537
+ r.text = ""
538
+ p = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
539
+ r = p.add_run(str(text or ""))
540
+ r.font.color.rgb = BLACK
541
+ try:
542
+ r.font.color.theme_color = None
543
+ except Exception:
544
+ pass
545
+
546
+ def nz(x: Optional[str]) -> str:
547
+ return (x or "").strip()
548
+
549
+ def canon(s: str) -> str:
550
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
551
+ s = s.replace("–", "-").replace("—", "-")
552
+ return re.sub(r"[^a-z0-9/#()+,.\- ]+", "", s)
553
+
554
+ def canon_label(s: str) -> str:
555
+ # labels often vary by punctuation/casing; keep digits/letters
556
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
557
+ s = s.replace("–", "-").replace("—", "-")
558
+ s = re.sub(r"[^a-z0-9 ]+", " ", s)
559
+ return re.sub(r"\s+", " ", s).strip()
560
+
561
+ def join_value(value) -> str:
562
+ if isinstance(value, list):
563
+ # Keep multi-line when list provided
564
+ return "\n".join([str(v) for v in value if nz(v)])
565
+ return str(value)
566
+
567
+ def split_digits(s: str) -> List[str]:
568
+ return re.findall(r"\d", s)
569
+
570
+ def para_text(p: Paragraph) -> str:
571
+ return "".join(run.text for run in p.runs)
572
+
573
+ def cell_text(c: _Cell) -> str:
574
+ return "\n".join(para_text(p) for p in c.paragraphs)
575
+
576
+ def is_red_run(run) -> bool:
577
+ col = run.font.color
578
+ if not col:
579
+ return False
580
+ if col.rgb is not None:
581
+ return col.rgb == RED
582
+ # Some templates use theme colors; treat explicit red text snippets only
583
+ return False
584
+
585
+ def replace_red_in_paragraph(p: Paragraph, new_text: str) -> bool:
586
+ replaced = False
587
+ red_runs = [r for r in p.runs if is_red_run(r)]
588
+ if not red_runs:
589
+ return False
590
+ # collapse all red runs into one and write value (in black)
591
+ first = red_runs[0]
592
+ _set_text_and_black(first, new_text)
593
+ for r in red_runs[1:]:
594
+ r.text = ""
595
+ replaced = True
596
+ return replaced
597
+
598
+ def replace_red_in_cell(cell: _Cell, new_text: str) -> bool:
599
+ # replace only red runs; if none, replace whole cell with a single run (fallback)
600
+ any_red = False
601
+ for p in cell.paragraphs:
602
+ if replace_red_in_paragraph(p, new_text):
603
+ any_red = True
604
+ if any_red:
605
+ return True
606
+ # fallback: clear cell, set single paragraph text in black
607
+ _set_cell_text_black(cell, new_text)
608
+ return True
609
+
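
A minimal usage sketch of the red-run helpers above, assuming a template whose placeholders are red runs; the file name is a hypothetical example:

    from docx import Document

    doc = Document("audit_template.docx")              # hypothetical template path
    for p in doc.paragraphs:
        if replace_red_in_paragraph(p, "ACME Transport Pty Ltd"):
            break                                      # first red placeholder rewritten in black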
610
+ def parse_attendance_lines(value) -> List[str]:
611
+ """
612
+ Parse strings like:
613
+ "Peter Sheppard - Compliance Greg Dyer - Auditor"
614
+ into:
615
+ ["Peter Sheppard - Compliance", "Greg Dyer - Auditor"]
616
+ Handles lists, newlines, semicolons, and pipes too.
617
+ """
618
+ if isinstance(value, list):
619
+ s = " ".join(str(v) for v in value if v)
620
+ else:
621
+ s = str(value or "")
622
+ s = re.sub(r"\s+", " ", s).strip()
623
+ if not s:
624
+ return []
625
+
626
+ # First split on explicit separators; then within each chunk, extract Name - Title pairs.
627
+ chunks = re.split(r"\s*[\n;|]\s*", s)
628
+ items: List[str] = []
629
+
630
+ pair_pat = re.compile(
631
+ r"([A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3})\s*-\s*"
632
+ r"([^-\n]+?)(?=\s+[A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3}\s*-\s*|$)"
633
+ )
634
+
635
+ for chunk in chunks:
636
+ chunk = chunk.strip()
637
+ if not chunk:
638
+ continue
639
+ found = False
640
+ for m in pair_pat.finditer(chunk):
641
+ name = m.group(1).strip()
642
+ title = m.group(2).strip()
643
+ items.append(f"{name} - {title}")
644
+ found = True
645
+ if not found:
646
+ # Fallback: single "Name - Title"
647
+ if " - " in chunk:
648
+ a, b = chunk.split(" - ", 1)
649
+ items.append(f"{a.strip()} - {b.strip()}")
650
+ elif chunk:
651
+ items.append(chunk)
652
+
653
+ return items
654
+
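
A quick check of the pair parser above on the docstring's own example (illustrative only):

    assert parse_attendance_lines("Peter Sheppard - Compliance Greg Dyer - Auditor") == [
        "Peter Sheppard - Compliance",
        "Greg Dyer - Auditor",
    ]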
655
+ def fill_attendance_block(doc: Document, value) -> bool:
656
+ items = parse_attendance_lines(value)
657
+ if not items:
658
+ return False
659
+
660
+ loc = find_label_cell(doc, "Attendance List (Names and Position Titles)")
661
+ if not loc:
662
+ return False
663
+
664
+ t, r, c = loc
665
+ # value cell: usually directly under the heading cell
666
+ target = (
667
+ t.rows[r + 1].cells[c]
668
+ if r + 1 < len(t.rows) and c < len(t.rows[r + 1].cells)
669
+ else get_adjacent_value_cell(t, r, c)
670
+ )
671
+
672
+ # ---- read ONLY the target cell (don’t touch the row)
673
+ def is_red_para(p): return any(is_red_run(run) for run in p.runs)
674
+ def looks_like_pair(s: str) -> bool:
675
+ if " - " not in s: return False
676
+ a, b = s.split(" - ", 1)
677
+ return bool(a.strip()) and bool(b.strip())
678
+
679
+ paras = list(target.paragraphs)
680
+ red_count = sum(1 for p in paras if is_red_para(p))
681
+ existing_black = [para_text(p).strip() for p in paras
682
+ if (not is_red_para(p)) and looks_like_pair(para_text(p))]
683
+
684
+ # compose final lines
685
+ out_lines: List[str] = []
686
+ out_lines.extend(items[:red_count]) # replace red placeholders
687
+ out_lines.extend(existing_black) # keep black lines
688
+ norm = lambda s: re.sub(r"\s+", " ", s.strip().lower())
689
+ seen = {norm(x) for x in out_lines}
690
+ for extra in items[red_count:]:
691
+ k = norm(extra)
692
+ if k not in seen:
693
+ out_lines.append(extra); seen.add(k)
694
+
695
+ # ---- hard clear target cell and write fresh (all black)
696
+ _nuke_cell_paragraphs(target)
697
+ # first line
698
+ p = target.add_paragraph()
699
+ _clear_para_and_write_black(p, out_lines[0] if out_lines else "")
700
+ # remaining lines
701
+ for line in out_lines[1:]:
702
+ p = target.add_paragraph()
703
+ _clear_para_and_write_black(p, line)
704
+
705
+ return True
706
+
707
+ # ----------------------------- document search -----------------------------
708
+ def iter_tables(doc: Document) -> List[Table]:
709
+ return list(doc.tables)
710
+
711
+ def iter_paragraphs(doc: Document) -> List[Paragraph]:
712
+ # paragraphs at doc level + inside tables
713
+ out = list(doc.paragraphs)
714
+ for t in doc.tables:
715
+ for row in t.rows:
716
+ for cell in row.cells:
717
+ out.extend(cell.paragraphs)
718
+ return out
719
+
720
+ def find_heading_paragraph(doc: Document, heading_text: str, window: int = 60) -> Optional[Paragraph]:
721
+ key = canon(heading_text)
722
+ for p in iter_paragraphs(doc):
723
+ if canon(para_text(p)).startswith(key):
724
+ return p
725
+ # fuzzy contains
726
+ for p in iter_paragraphs(doc):
727
+ if key in canon(para_text(p)):
728
+ return p
729
+ return None
730
+
731
+ def find_label_cell_in_table(table: Table, label: str) -> Optional[Tuple[int, int]]:
732
+ target = canon_label(label)
733
+ for r_i, row in enumerate(table.rows):
734
+ for c_i, cell in enumerate(row.cells):
735
+ if canon_label(cell_text(cell)) == target:
736
+ return (r_i, c_i)
737
+ # allow contains (safe-ish)
738
+ for r_i, row in enumerate(table.rows):
739
+ for c_i, cell in enumerate(row.cells):
740
+ if target and target in canon_label(cell_text(cell)):
741
+ return (r_i, c_i)
742
+ return None
743
+
744
+ def find_label_cell(doc: Document, label: str) -> Optional[Tuple[Table, int, int]]:
745
+ for t in iter_tables(doc):
746
+ pos = find_label_cell_in_table(t, label)
747
+ if pos:
748
+ return (t, pos[0], pos[1])
749
+ return None
750
+
751
+ def get_adjacent_value_cell(table: Table, r: int, c: int) -> _Cell:
752
+ # Prefer right cell, otherwise next row same col, otherwise this cell
753
+ cols = len(table.rows[0].cells)
754
+ if c + 1 < cols:
755
+ return table.rows[r].cells[c+1]
756
+ if r + 1 < len(table.rows):
757
+ return table.rows[r+1].cells[c]
758
+ return table.rows[r].cells[c]
759
+
760
+ # ----------------------------- label/value updates -----------------------------
761
+ def update_label_value_in_tables(doc: Document, label: str, value) -> bool:
762
+ tup = find_label_cell(doc, label)
763
+ val = join_value(value)
764
+ if not tup:
765
+ return False
766
+ t, r, c = tup
767
+ target_cell = get_adjacent_value_cell(t, r, c)
768
+ return replace_red_in_cell(target_cell, val)
769
+
770
+ def update_heading_followed_red(doc: Document, heading: str, value, max_scan: int = 12) -> bool:
771
+ """Find heading paragraph, then replace the first red run found within next N paragraphs (including inside tables)"""
772
+ start = find_heading_paragraph(doc, heading)
773
+ if not start:
774
+ return False
775
+ # Build a linear list of paragraphs across whole doc to get an index
776
+ allp = iter_paragraphs(doc)
777
+ try:
778
+ idx = allp.index(start)
779
+ except ValueError:
780
+ idx = 0
781
+ new_text = join_value(value)
782
+ # Scan forward
783
+ for p in allp[idx+1: idx+1+max_scan]:
784
+ if replace_red_in_paragraph(p, new_text):
785
+ return True
786
+ # Also check any red in table cells inside this paragraph's parent (already covered via iter_paragraphs)
787
+ return False
788
+
789
+ # ----------------------------- ACN per-digit fill -----------------------------
790
+ def fill_acn_digits(doc: Document, acn_value: str) -> bool:
791
+ digits = split_digits(acn_value)
792
+ if not digits:
793
+ return False
794
+ loc = find_label_cell(doc, "Australian Company Number")
795
+ if not loc:
796
+ return False
797
+
798
+ t, r, c = loc
799
+
800
+ # Collect cells to the RIGHT in the same row first
801
+ targets: List[_Cell] = [t.rows[r].cells[j] for j in range(c + 1, len(t.rows[r].cells))]
802
+
803
+ # If not enough, continue row-by-row below (left→right)
804
+ rr = r + 1
805
+ while len(targets) < len(digits) and rr < len(t.rows):
806
+ targets.extend(list(t.rows[rr].cells))
807
+ rr += 1
808
+
809
+ targets = targets[:len(digits)]
810
+ if not targets:
811
+ return False
812
+
813
+ # Clear each target cell and write ONE digit in black
814
+ for d, cell in zip(digits, targets):
815
+ _set_cell_text_black(cell, d)
816
+
817
+ return True
818
+
819
+
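
A minimal sketch of the per-digit fan-out above, assuming the template has an 'Australian Company Number' label followed by one cell per digit; the paths are hypothetical:

    from docx import Document

    doc = Document("audit_template.docx")
    fill_acn_digits(doc, "123 456 789")   # writes 1, 2, 3, ... one digit per cell, in black
    doc.save("audit_filled.docx")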
820
+ # ----------------------------- vehicle tables -----------------------------
821
+ def table_header_text(table: Table, up_to_rows: int = 3) -> str:
822
+ heads = []
823
+ for i, row in enumerate(table.rows[:up_to_rows]):
824
+ for cell in row.cells:
825
+ heads.append(cell_text(cell))
826
+ return canon(" ".join(heads))
827
+
828
+ def find_vehicle_table(doc: Document, want: str) -> Optional[Table]:
829
+ """
830
+ want = "maintenance" or "mass"
831
+ """
832
+ MAINT_KEYS = ["registration number", "maintenance records", "daily checks", "fault recording", "fault repair"]
833
+ MASS_KEYS = ["registration number", "weight verification", "rfs suspension", "suspension system maintenance", "trip records", "reporting on suspension"]
834
+ candidates = []
835
+ for t in iter_tables(doc):
836
+ htxt = table_header_text(t)
837
+ if want == "maintenance":
838
+ if all(k in htxt for k in ["registration", "maintenance", "fault"]) and "suspension" not in htxt:
839
+ candidates.append(t)
840
+ elif want == "mass":
841
+ if "suspension" in htxt and "weight" in htxt:
842
+ candidates.append(t)
843
+ # Prefer the one with most rows
844
+ if not candidates:
845
+ return None
846
+ return max(candidates, key=lambda tb: len(tb.rows))
847
+
848
+ def map_cols(table: Table, want: str) -> Dict[str, int]:
849
+ # map header columns by keywords from the first 2 rows that contain headers
850
+ header_rows = table.rows[:2]
851
+ col_texts = []
852
+ cols = len(table.rows[0].cells)
853
+ for j in range(cols):
854
+ txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
855
+ col_texts.append(canon(txt))
856
+ idx = {}
857
+ def first_col(*needles) -> Optional[int]:
858
+ for j, t in enumerate(col_texts):
859
+ if all(n in t for n in needles):
860
+ return j
861
+ return None
862
+ if want == "maintenance":
863
+ idx["reg"] = first_col("registration")
864
+ idx["rw"] = first_col("roadworthiness")
865
+ idx["mr"] = first_col("maintenance", "records")
866
+ idx["daily"] = first_col("daily", "check")
867
+ idx["fr"] = first_col("fault", "recording")
868
+ idx["rep"] = first_col("fault", "repair")
869
+ else:
870
+ idx["reg"] = first_col("registration")
871
+ idx["wv"] = first_col("weight", "verification")
872
+ idx["rfs"] = first_col("rfs", "cert")
873
+ idx["susp"] = first_col("suspension", "maintenance")
874
+ idx["trip"] = first_col("trip", "record")
875
+ idx["frs"] = first_col("fault", "suspension")
876
+ return {k:v for k,v in idx.items() if v is not None}
877
+
878
+ def clear_data_rows_keep_headers(table: Table, header_rows: int = 1):
879
+ # Keep first header_rows, drop everything else
880
+ while len(table.rows) > header_rows:
881
+ table._tbl.remove(table.rows[-1]._tr)
882
+
883
+ def ensure_rows(table: Table, need_rows: int):
884
+ # assumes 1 header row; add rows to reach need_rows + 1 total
885
+ while len(table.rows) < need_rows + 1:
886
+ table.add_row()
887
+
888
+ def fill_vehicle_table(table: Table, want: str, arrays: Dict[str, List[str]]):
889
+ colmap = map_cols(table, want)
890
+ if "reg" not in colmap:
891
+ return
892
+ if want == "maintenance":
893
+ regs = arrays.get("Registration Number", [])
894
+ rw = arrays.get("Roadworthiness Certificates", [])
895
+ mr = arrays.get("Maintenance Records", [])
896
+ daily= arrays.get("Daily Checks", [])
897
+ fr = arrays.get("Fault Recording/ Reporting", [])
898
+ rep = arrays.get("Fault Repair", [])
899
+ n = len(regs)
900
+ # keep header row(s), then fill N rows
901
+ clear_data_rows_keep_headers(table, header_rows=1)
902
+ ensure_rows(table, n)
903
+ for i in range(n):
904
+ row = table.rows[i+1]
905
+ def put(col_key, vals):
906
+ if col_key not in colmap or i >= len(vals): return
907
+ c = row.cells[colmap[col_key]]
908
+ replace_red_in_cell(c, nz(vals[i]))
909
+ # write each col
910
+ c_reg = row.cells[colmap["reg"]]; replace_red_in_cell(c_reg, nz(regs[i]))
911
+ put("rw", rw)
912
+ put("mr", mr)
913
+ put("daily",daily)
914
+ put("fr", fr)
915
+ put("rep", rep)
916
+ else:
917
+ regs = arrays.get("Registration Number", [])
918
+ wv = arrays.get("Weight Verification Records", [])
919
+ rfs = arrays.get("RFS Suspension Certification #", [])
920
+ susp = arrays.get("Suspension System Maintenance", [])
921
+ trip = arrays.get("Trip Records", [])
922
+ frs = arrays.get("Fault Recording/ Reporting on Suspension System", [])
923
+ n = len(regs)
924
+ clear_data_rows_keep_headers(table, header_rows=1)
925
+ ensure_rows(table, n)
926
+ for i in range(n):
927
+ row = table.rows[i+1]
928
+ def put(col_key, vals):
929
+ if col_key not in colmap or i >= len(vals): return
930
+ c = row.cells[colmap[col_key]]
931
+ replace_red_in_cell(c, nz(vals[i]))
932
+ c_reg = row.cells[colmap["reg"]]; replace_red_in_cell(c_reg, nz(regs[i]))
933
+ put("wv", wv)
934
+ put("rfs", rfs)
935
+ put("susp", susp)
936
+ put("trip", trip)
937
+ put("frs", frs)
938
+
939
+ # ----------------------------- driver table -----------------------------
940
+ def find_driver_table(doc: Document) -> Optional[Table]:
941
+ for t in iter_tables(doc):
942
+ h = table_header_text(t)
943
+ if "driver / scheduler" in h and ("fit for duty" in h or "work diary" in h):
944
+ return t
945
+ return None
946
+
947
+ def map_driver_cols(table: Table) -> Dict[str,int]:
948
+ header_rows = table.rows[:2]
949
+ cols = len(table.rows[0].cells)
950
+ col_texts = []
951
+ for j in range(cols):
952
+ txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
953
+ col_texts.append(canon(txt))
954
+ idx = {}
955
+ def first_col(*needles):
956
+ for j, t in enumerate(col_texts):
957
+ if all(n in t for n in needles):
958
+ return j
959
+ return None
960
+ idx["name"] = first_col("driver", "name")
961
+ idx["roster"]= first_col("roster", "safe")
962
+ idx["fit"] = first_col("fit for duty")
963
+ # Work diary might be split across two headers; match "work diary" OR "electronic work diary"
964
+ wd = first_col("work diary") or first_col("electronic work diary")
965
+ if wd is not None: idx["wd"] = wd
966
+ return {k:v for k,v in idx.items() if v is not None}
967
+
968
+ def fill_driver_table(table: Table, arrays: Dict[str, List[str]]):
969
+ colmap = map_driver_cols(table)
970
+ if not colmap:
971
+ return
972
+
973
+ names = arrays.get("Driver / Scheduler Name", [])
974
+ rosters = arrays.get("Roster / Schedule / Safe Driving Plan (Date Range)", [])
975
+ fit = arrays.get("Fit for Duty Statement Completed (Yes/No)", [])
976
+ wd = arrays.get("Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)", [])
977
+
978
+ n = max(len(rosters), len(fit), len(wd), len(names))
979
+ clear_data_rows_keep_headers(table, header_rows=1)
980
+ ensure_rows(table, n)
981
+
982
+ has_any_name = any(str(x).strip() for x in names)
983
+
984
+ for i in range(n):
985
+ row = table.rows[i+1]
986
+ if "name" in colmap and has_any_name:
987
+ replace_red_in_cell(row.cells[colmap["name"]], names[i] if i < len(names) else "")
988
+ if "roster" in colmap:
989
+ replace_red_in_cell(row.cells[colmap["roster"]], rosters[i] if i < len(rosters) else "")
990
+ if "fit" in colmap:
991
+ replace_red_in_cell(row.cells[colmap["fit"]], fit[i] if i < len(fit) else "")
992
+ if "wd" in colmap:
993
+ replace_red_in_cell(row.cells[colmap["wd"]], wd[i] if i < len(wd) else "")
994
+
995
+
996
+
997
+ # ----------------------------- main mapping -----------------------------
998
+ def flatten_simple_sections(data: Dict) -> Dict[str, str]:
999
+ """Collect simple label->single value mappings from top-level sections other than tables."""
1000
+ out = {}
1001
+ skip_sections = {
1002
+ "Vehicle Registration Numbers Maintenance",
1003
+ "Vehicle Registration Numbers Mass",
1004
+ "Driver / Scheduler Records Examined",
1005
+ "paragraphs",
1006
+ "Attendance List (Names and Position Titles)",
1007
+ "Nature of the Operators Business (Summary)",
1008
+ "Maintenance Management Summary",
1009
+ "Mass Management Summary",
1010
+ "Fatigue Management Summary",
1011
+ }
1012
+ for sec, kv in data.items():
1013
+ if sec in skip_sections: continue
1014
+ if not isinstance(kv, dict): continue
1015
+ for label, val in kv.items():
1016
+ out[f"{sec}::{label}"] = join_value(val)
1017
+ return out
1018
+
1019
+ def run(input_json: Path, template_docx: Path, output_docx: Path):
1020
+ with open(input_json, "r", encoding="utf-8") as f:
1021
+ data = json.load(f)
1022
+
1023
+ doc = Document(str(template_docx))
1024
+
1025
+ # 1) simple label/value tables
1026
+ simple = flatten_simple_sections(data)
1027
+
1028
+ # Map by (section::label). We try: (a) find exact label cell somewhere and write in the adjacent cell;
1029
+ # (b) if not found, search by heading then the next red run below the heading.
1030
+ for k, v in simple.items():
1031
+ # use the part after '::' as the label
1032
+ label = k.split("::", 1)[1] if "::" in k else k
1033
+
1034
+ # SPECIAL: skip ACN here; we'll fill per-digit later
1035
+ if canon_label(label) == "australian company number":
1036
+ continue
1037
+
1038
+ ok = update_label_value_in_tables(doc, label, v)
1039
+ if not ok:
1040
+ sec = k.split("::", 1)[0] if "::" in k else k
1041
+ update_heading_followed_red(doc, sec, v)
1042
+
1043
+
1044
+ # 2) paragraphs block
1045
+ paras = data.get("paragraphs", {})
1046
+
1047
+ # 2a) generic headings → replace next red (skip the 3 management headings here)
1048
+ # third-line headings above the three tables
1049
+ for head in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
1050
+ name_val = join_value(paras.get(head, ""))
1051
+ if name_val:
1052
+ update_heading_followed_red(doc, head, name_val, max_scan=6)
1053
+
1054
+ # 2b) the 3-layer headings → overwrite the 3rd line only
1055
+ # second-last page: date under page heading
1056
+ aud_head = "NHVAS APPROVED AUDITOR DECLARATION"
1057
+ aud_date = join_value(paras.get(aud_head, ""))
1058
+ if aud_date:
1059
+ set_date_by_heading_from_end(doc, aud_head, aud_date, max_scan=40)
1060
+
1061
+ # last page: date under the long acknowledgement paragraph
1062
+ ack_head = ("I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
1063
+ "I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.")
1064
+ ack_date = join_value(paras.get(ack_head, ""))
1065
+ if ack_date:
1066
+ set_date_by_paragraph_from_end(doc, ack_head, ack_date, max_scan=40)
1067
+
1068
+ maint_name = join_value(paras.get("MAINTENANCE MANAGEMENT", ""))
1069
+ if maint_name:
1070
+ set_layer3_name_after_management_heading(
1071
+ doc,
1072
+ "MAINTENANCE MANAGEMENT",
1073
+ ["Vehicle Registration Numbers of Records Examined"],
1074
+ maint_name,
1075
+ )
1076
+
1077
+ mass_name = join_value(paras.get("MASS MANAGEMENT", ""))
1078
+ if mass_name:
1079
+ set_layer3_name_after_management_heading(
1080
+ doc,
1081
+ "MASS MANAGEMENT",
1082
+ ["Vehicle Registration Numbers of Records Examined"],
1083
+ mass_name,
1084
+ )
1085
+
1086
+ fat_name = join_value(paras.get("FATIGUE MANAGEMENT", ""))
1087
+ if fat_name:
1088
+ set_layer3_name_after_management_heading(
1089
+ doc,
1090
+ "FATIGUE MANAGEMENT",
1091
+ ["Driver / Scheduler Records Examined"],
1092
+ fat_name,
1093
+ )
1094
+
1095
+
1096
+ # 3) ACN digits
1097
+ op_info = data.get("Operator Information", {})
1098
+ acn_val = join_value(op_info.get("Australian Company Number", ""))
1099
+ if acn_val:
1100
+ fill_acn_digits(doc, acn_val)
1101
+
1102
+ # 4) Vehicle tables
1103
+ maint = data.get("Vehicle Registration Numbers Maintenance", {})
1104
+ mass = data.get("Vehicle Registration Numbers Mass", {})
1105
+ t_m = find_vehicle_table(doc, "maintenance")
1106
+ if t_m and maint:
1107
+ fill_vehicle_table(t_m, "maintenance", maint)
1108
+ t_ms = find_mass_vehicle_numbers_table(doc)
1109
+ if t_ms and mass:
1110
+ fill_mass_vehicle_table_preserve_headers(t_ms, mass)
1111
+
1112
+ # 5) Driver table
1113
+ drivers = data.get("Driver / Scheduler Records Examined", {})
1114
+ t_d = find_driver_table(doc)
1115
+ if t_d and drivers:
1116
+ fill_driver_table(t_d, drivers)
1117
+
1118
+ # 6) Special: Audit Declaration dates via heading
1119
+ decl = data.get("Audit Declaration dates", {})
1120
+ if decl.get("Audit was conducted on"):
1121
+ update_heading_followed_red(doc, "Audit was conducted on", decl["Audit was conducted on"])
1122
+
1123
+ # 7) Operator Declaration (last page, bottom row only), and fix Auditor table header
1124
+ op_decl = data.get("Operator Declaration", {})
1125
+ if op_decl:
1126
+ fill_operator_declaration(
1127
+ doc,
1128
+ join_value(op_decl.get("Print Name", "")),
1129
+ join_value(op_decl.get("Position Title", "")),
1130
+ )
1131
+
1132
+ # ensure the header row of the second-last page “NHVAS APPROVED AUDITOR DECLARATION” table holds the label text, not pasted values
1133
+ ensure_auditor_decl_headers(doc)
1134
+
1135
+
1136
+ # 8) Attendance List
1137
+ # Attendance: replace red lines only
1138
+ atts = data.get("Attendance List (Names and Position Titles)", {})
1139
+ att_val = atts.get("Attendance List (Names and Position Titles)")
1140
+ if att_val:
1141
+ fill_attendance_block(doc, att_val)
1142
+
1143
+ # 9) Nature of the Operators Business (Summary): write once (no duplicates)
1144
+ biz = data.get("Nature of the Operators Business (Summary)", {})
1145
+ if biz:
1146
+ val = biz.get("Nature of the Operators Business (Summary):") or next(iter(biz.values()), "")
1147
+ if val:
1148
+ update_business_summary_once(doc, val)
1149
+
1150
+ # 10) Summary tables: FULL OVERWRITE of DETAILS from JSON
1151
+ mm_sum = data.get("Maintenance Management Summary", {})
1152
+ if mm_sum:
1153
+ overwrite_summary_details_cells(doc, "Maintenance Management Summary", mm_sum)
1154
+
1155
+ mass_sum = data.get("Mass Management Summary", {})
1156
+ if mass_sum:
1157
+ overwrite_summary_details_cells(doc, "Mass Management Summary", mass_sum)
1158
+
1159
+ fat_sum = data.get("Fatigue Management Summary", {})
1160
+ if fat_sum:
1161
+ overwrite_summary_details_cells(doc, "Fatigue Management Summary", fat_sum)
1162
+
1163
+
1164
+ doc.save(str(output_docx))
1165
+
1166
+ # ----------------------------- CLI -----------------------------
1167
+ if __name__ == "__main__":
1168
+ import sys
1169
+ from pathlib import Path
1170
+
1171
+ if len(sys.argv) != 4:
1172
+ print("Usage: python updated_word.py <json> <template.docx> <output.docx>")
1173
+ sys.exit(1)
1174
+
1175
+ a, b, c = map(Path, sys.argv[1:4])
1176
+ files = [a, b, c]
1177
+
1178
+ json_path = next((p for p in files if p.suffix.lower() == ".json"), None)
1179
+ docx_paths = [p for p in files if p.suffix.lower() == ".docx"]
1180
+
1181
+ if not json_path or len(docx_paths) < 2:
1182
+ print("Error: provide one .json and two .docx (template + output).")
1183
+ sys.exit(1)
1184
+
1185
+ # Template = the .docx that already exists; Output = the other .docx
1186
+ template_docx = next((p for p in docx_paths if p.exists()), docx_paths[0])
1187
+ output_docx = docx_paths[1] if docx_paths[0] == template_docx else docx_paths[0]
1188
+
1189
+ run(json_path, template_docx, output_docx)
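
A hypothetical invocation of the filler above, assuming the module is importable as updated_word; the paths are examples only:

    from pathlib import Path
    from updated_word import run

    run(
        Path("merged_output.json"),    # produced by update_docx_with_pdf.py
        Path("NHVAS_template.docx"),   # template containing the red placeholders
        Path("NHVAS_filled.docx"),     # output with placeholders rewritten in black
    )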
src/adapters/__init__.py ADDED
File without changes
src/adapters/infrastructure/__init__.py ADDED
File without changes
src/adapters/infrastructure/format_conversion_service_adapter.py ADDED
@@ -0,0 +1,13 @@
1
+ from domain.PdfImages import PdfImages
2
+ from domain.PdfSegment import PdfSegment
3
+ from ports.services.format_conversion_service import FormatConversionService
4
+ from adapters.infrastructure.format_converters.convert_table_to_html import extract_table_format
5
+ from adapters.infrastructure.format_converters.convert_formula_to_latex import extract_formula_format
6
+
7
+
8
+ class FormatConversionServiceAdapter(FormatConversionService):
9
+ def convert_table_to_html(self, pdf_images: PdfImages, segments: list[PdfSegment]) -> None:
10
+ extract_table_format(pdf_images, segments)
11
+
12
+ def convert_formula_to_latex(self, pdf_images: PdfImages, segments: list[PdfSegment]) -> None:
13
+ extract_formula_format(pdf_images, segments)
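
A minimal sketch of how the adapter above might be consumed behind its port; it uses only the names imported in the file, and the helper function itself is hypothetical:

    from domain.PdfImages import PdfImages
    from domain.PdfSegment import PdfSegment
    from ports.services.format_conversion_service import FormatConversionService

    def enrich_segments(service: FormatConversionService, images: PdfImages, segments: list[PdfSegment]) -> None:
        # each call rewrites the matching segments' text_content in place
        service.convert_table_to_html(images, segments)
        service.convert_formula_to_latex(images, segments)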
src/adapters/infrastructure/format_converters/__init__.py ADDED
File without changes
src/adapters/infrastructure/format_converters/convert_formula_to_latex.py ADDED
@@ -0,0 +1,43 @@
1
+ from PIL.Image import Image
2
+ from pix2tex.cli import LatexOCR
3
+ from domain.PdfImages import PdfImages
4
+ from domain.PdfSegment import PdfSegment
5
+ from pdf_token_type_labels import TokenType
6
+ import latex2mathml.converter
7
+
8
+
9
+ def has_arabic(text: str) -> bool:
10
+ return any("\u0600" <= char <= "\u06FF" or "\u0750" <= char <= "\u077F" for char in text)
11
+
12
+
13
+ def is_valid_latex(formula: str) -> bool:
14
+ try:
15
+ latex2mathml.converter.convert(formula)
16
+ return True
17
+ except Exception:
18
+ return False
19
+
20
+
21
+ def extract_formula_format(pdf_images: PdfImages, predicted_segments: list[PdfSegment]):
22
+ formula_segments = [segment for segment in predicted_segments if segment.segment_type == TokenType.FORMULA]
23
+ if not formula_segments:
24
+ return
25
+
26
+ model = LatexOCR()
27
+ model.args.temperature = 1e-8
28
+
29
+ for formula_segment in formula_segments:
30
+ if has_arabic(formula_segment.text_content):
31
+ continue
32
+ page_image: Image = pdf_images.pdf_images[formula_segment.page_number - 1]
33
+ left, top = formula_segment.bounding_box.left, formula_segment.bounding_box.top
34
+ right, bottom = formula_segment.bounding_box.right, formula_segment.bounding_box.bottom
35
+ left = int(left * pdf_images.dpi / 72)
36
+ top = int(top * pdf_images.dpi / 72)
37
+ right = int(right * pdf_images.dpi / 72)
38
+ bottom = int(bottom * pdf_images.dpi / 72)
39
+ formula_image = page_image.crop((left, top, right, bottom))
40
+ formula_result = model(formula_image)
41
+ if not is_valid_latex(formula_result):
42
+ continue
43
+ formula_segment.text_content = f"$${formula_result}$$"
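
The crop above converts PDF points (1/72 inch) into image pixels at the render DPI; a minimal restatement of that scaling:

    def points_to_pixels(value_pt: float, dpi: int) -> int:
        return int(value_pt * dpi / 72)

    assert points_to_pixels(72, 120) == 120   # one inch of page maps to 120 px at 120 DPI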
src/adapters/infrastructure/format_converters/convert_table_to_html.py ADDED
@@ -0,0 +1,33 @@
1
+ from PIL import Image
2
+ from domain.PdfImages import PdfImages
3
+ from domain.PdfSegment import PdfSegment
4
+ from pdf_token_type_labels import TokenType
5
+ from rapidocr import RapidOCR
6
+ from rapid_table import ModelType, RapidTable, RapidTableInput
7
+
8
+
9
+ def extract_table_format(pdf_images: PdfImages, predicted_segments: list[PdfSegment]):
10
+ table_segments = [segment for segment in predicted_segments if segment.segment_type == TokenType.TABLE]
11
+ if not table_segments:
12
+ return
13
+
14
+ input_args = RapidTableInput(model_type=ModelType["SLANETPLUS"])
15
+
16
+ ocr_engine = RapidOCR()
17
+ table_engine = RapidTable(input_args)
18
+
19
+ for table_segment in table_segments:
20
+ page_image: Image = pdf_images.pdf_images[table_segment.page_number - 1]
21
+ left, top = table_segment.bounding_box.left, table_segment.bounding_box.top
22
+ right, bottom = table_segment.bounding_box.right, table_segment.bounding_box.bottom
23
+ left = int(left * pdf_images.dpi / 72)
24
+ top = int(top * pdf_images.dpi / 72)
25
+ right = int(right * pdf_images.dpi / 72)
26
+ bottom = int(bottom * pdf_images.dpi / 72)
27
+ table_image = page_image.crop((left, top, right, bottom))
28
+ ori_ocr_res = ocr_engine(table_image)
29
+ if not ori_ocr_res.txts:
30
+ continue
31
+ ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
32
+ table_result = table_engine(table_image, ocr_results=ocr_results)
33
+ table_segment.text_content = table_result.pred_html
src/adapters/infrastructure/html_conversion_service_adapter.py ADDED
@@ -0,0 +1,23 @@
+ from typing import Optional, Union
+ from starlette.responses import Response
+
+ from domain.SegmentBox import SegmentBox
+ from ports.services.html_conversion_service import HtmlConversionService
+ from adapters.infrastructure.markup_conversion.pdf_to_markup_service_adapter import PdfToMarkupServiceAdapter
+ from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+
+
+ class HtmlConversionServiceAdapter(HtmlConversionService, PdfToMarkupServiceAdapter):
+
+     def __init__(self):
+         PdfToMarkupServiceAdapter.__init__(self, OutputFormat.HTML)
+
+     def convert_to_html(
+         self,
+         pdf_content: bytes,
+         segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         output_file: Optional[str] = None,
+     ) -> Union[str, Response]:
+         return self.convert_to_format(pdf_content, segments, extract_toc, dpi, output_file)
src/adapters/infrastructure/markdown_conversion_service_adapter.py ADDED
@@ -0,0 +1,23 @@
+ from typing import Optional, Union
+ from starlette.responses import Response
+
+ from domain.SegmentBox import SegmentBox
+ from ports.services.markdown_conversion_service import MarkdownConversionService
+ from adapters.infrastructure.markup_conversion.pdf_to_markup_service_adapter import PdfToMarkupServiceAdapter
+ from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+
+
+ class MarkdownConversionServiceAdapter(MarkdownConversionService, PdfToMarkupServiceAdapter):
+
+     def __init__(self):
+         PdfToMarkupServiceAdapter.__init__(self, OutputFormat.MARKDOWN)
+
+     def convert_to_markdown(
+         self,
+         pdf_content: bytes,
+         segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         output_file: Optional[str] = None,
+     ) -> Union[str, Response]:
+         return self.convert_to_format(pdf_content, segments, extract_toc, dpi, output_file)
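
The two adapters above differ only in the `OutputFormat` they pass to `PdfToMarkupServiceAdapter`; the actual conversion happens in `convert_to_format`. A hedged usage sketch (the PDF bytes and segment list are placeholders supplied by an earlier layout-analysis step, not produced here):

```python
# Sketch only: call both adapters on the same analysed document.
# `pdf_bytes` and `segments` are assumed inputs.
from adapters.infrastructure.html_conversion_service_adapter import HtmlConversionServiceAdapter
from adapters.infrastructure.markdown_conversion_service_adapter import MarkdownConversionServiceAdapter


def convert_both(pdf_bytes: bytes, segments: list) -> tuple[str, str]:
    # With output_file=None both calls return the markup as a plain string.
    html = HtmlConversionServiceAdapter().convert_to_html(pdf_bytes, segments, extract_toc=True)
    markdown = MarkdownConversionServiceAdapter().convert_to_markdown(pdf_bytes, segments, extract_toc=True)
    return html, markdown
```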
src/adapters/infrastructure/markup_conversion/ExtractedImage.py ADDED
@@ -0,0 +1,6 @@
+ from pydantic import BaseModel
+
+
+ class ExtractedImage(BaseModel):
+     image_data: bytes
+     filename: str
src/adapters/infrastructure/markup_conversion/Link.py ADDED
@@ -0,0 +1,8 @@
+ from pydantic import BaseModel
+ from domain.SegmentBox import SegmentBox
+
+
+ class Link(BaseModel):
+     source_segment: SegmentBox
+     destination_segment: SegmentBox
+     text: str
src/adapters/infrastructure/markup_conversion/OutputFormat.py ADDED
@@ -0,0 +1,6 @@
+ from enum import StrEnum
+
+
+ class OutputFormat(StrEnum):
+     HTML = "html"
+     MARKDOWN = "markdown"
src/adapters/infrastructure/markup_conversion/__init__.py ADDED
File without changes
src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py ADDED
@@ -0,0 +1,361 @@
+ import fitz
+ import tempfile
+ import zipfile
+ import io
+ import json
+ from fitz import Page
+ from pathlib import Path
+ from typing import Optional, Union
+ from PIL.Image import Image
+ from pdf2image import convert_from_path
+ from starlette.responses import Response
+
+ from domain.SegmentBox import SegmentBox
+ from pdf_features.PdfFeatures import PdfFeatures
+ from pdf_features.PdfToken import PdfToken
+ from pdf_features.Rectangle import Rectangle
+ from pdf_token_type_labels.Label import Label
+ from pdf_token_type_labels.PageLabels import PageLabels
+ from pdf_token_type_labels.PdfLabels import PdfLabels
+ from pdf_token_type_labels.TokenType import TokenType
+
+ from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+ from adapters.infrastructure.markup_conversion.Link import Link
+ from adapters.infrastructure.markup_conversion.ExtractedImage import ExtractedImage
+
+
+ class PdfToMarkupServiceAdapter:
+     def __init__(self, output_format: OutputFormat):
+         self.output_format = output_format
+
+     def convert_to_format(
+         self,
+         pdf_content: bytes,
+         segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         output_file: Optional[str] = None,
+     ) -> Union[str, Response]:
+         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
+             temp_file.write(pdf_content)
+             temp_pdf_path = Path(temp_file.name)
+
+         try:
+             extracted_images: list[ExtractedImage] = [] if output_file else None
+             user_base_name = Path(output_file).stem if output_file else None
+
+             content = self._generate_content(temp_pdf_path, segments, extract_toc, dpi, extracted_images, user_base_name)
+
+             if output_file:
+                 return self._create_zip_response(content, extracted_images, output_file, segments)
+
+             return content
+         finally:
+             if temp_pdf_path.exists():
+                 temp_pdf_path.unlink()
+
+     def _create_zip_response(
+         self,
+         content: str,
+         extracted_images: list[ExtractedImage],
+         output_filename: str,
+         segments: list[SegmentBox],
+     ) -> Response:
+         zip_buffer = io.BytesIO()
+
+         with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+             zip_file.writestr(output_filename, content.encode("utf-8"))
+
+             if extracted_images:
+                 base_name = Path(output_filename).stem
+                 pictures_dir = f"{base_name}_pictures/"
+
+                 for image in extracted_images:
+                     zip_file.writestr(f"{pictures_dir}{image.filename}", image.image_data)
+
+             base_name = Path(output_filename).stem
+             segmentation_filename = f"{base_name}_segmentation.json"
+             segmentation_data = self._create_segmentation_json(segments)
+             zip_file.writestr(segmentation_filename, segmentation_data)
+
+         zip_buffer.seek(0)
+
+         zip_filename = f"{Path(output_filename).stem}.zip"
+         return Response(
+             content=zip_buffer.getvalue(),
+             media_type="application/zip",
+             headers={"Content-Disposition": f"attachment; filename={zip_filename}"},
+         )
+
+     def _create_segmentation_json(self, segments: list[SegmentBox]) -> str:
+         segmentation_data = []
+         for segment in segments:
+             segmentation_data.append(segment.to_dict())
+         return json.dumps(segmentation_data, indent=4, ensure_ascii=False)
+
+     def _create_pdf_labels_from_segments(self, vgt_segments: list[SegmentBox]) -> PdfLabels:
+         page_numbers = sorted(set(segment.page_number for segment in vgt_segments))
+         page_labels: list[PageLabels] = []
+         for page_number in page_numbers:
+             segments_in_page = [s for s in vgt_segments if s.page_number == page_number]
+             labels: list[Label] = []
+             for segment in segments_in_page:
+                 rect = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+                 label = Label.from_rectangle(rect, TokenType.from_text(segment.type).get_index())
+                 labels.append(label)
+             page_labels.append(PageLabels(number=page_number, labels=labels))
+         return PdfLabels(pages=page_labels)
+
+     def _find_closest_segment(self, bounding_box: Rectangle, segments: list[SegmentBox]) -> Optional[SegmentBox]:
+         if not segments:
+             return None
+
+         def intersection_key(segment: SegmentBox) -> float:
+             segment_rect = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+             return bounding_box.get_intersection_percentage(segment_rect)
+
+         closest = max(segments, key=intersection_key)
+         max_intersection = intersection_key(closest)
+         if max_intersection > 0:
+             return closest
+
+         candidates = [s for s in segments if s.top > bounding_box.top]
+         if not candidates:
+             return None
+
+         def distance_key(segment: SegmentBox) -> tuple[float, float]:
+             vertical_dist = segment.top - bounding_box.top
+             segment_center_x = segment.left + segment.width / 2
+             box_center_x = bounding_box.left + bounding_box.width / 2
+             horizontal_dist = abs(segment_center_x - box_center_x)
+             return (vertical_dist, horizontal_dist)
+
+         return min(candidates, key=distance_key)
+
+     def _get_link_segments(
+         self, link: dict, page: Page, segments_by_page: dict[int, list[SegmentBox]]
+     ) -> Optional[tuple[SegmentBox, SegmentBox]]:
+         rect = link["from"]
+         source_box = Rectangle.from_coordinates(rect[0], rect[1], rect[2], rect[3])
+         source_page_num = page.number + 1
+         source_segments = segments_by_page.get(source_page_num, [])
+         source_segment = self._find_closest_segment(source_box, source_segments)
+         if not source_segment:
+             return None
+
+         dest_page_num = link.get("page", -1) + 1
+         dest_segments = segments_by_page.get(dest_page_num, [])
+         if not dest_segments:
+             return None
+
+         if "to" not in link:
+             dest_box = Rectangle.from_coordinates(0, 0, 20, 20)
+         else:
+             dest = link["to"] * page.transformation_matrix
+             dest_box = Rectangle.from_coordinates(dest[0], dest[1], dest[0] + 20, dest[1] + 20)
+
+         dest_segment = self._find_closest_segment(dest_box, dest_segments)
+         if not dest_segment:
+             return None
+
+         return source_segment, dest_segment
+
+     def _extract_links_by_segments(
+         self, pdf_path: Path, vgt_segments: list[SegmentBox]
+     ) -> tuple[dict[SegmentBox, list[Link]], dict[SegmentBox, list[Link]]]:
+         links_by_source: dict[SegmentBox, list[Link]] = {}
+         links_by_dest: dict[SegmentBox, list[Link]] = {}
+
+         segments_by_page: dict[int, list[SegmentBox]] = {}
+         for segment in vgt_segments:
+             segments_by_page.setdefault(segment.page_number, []).append(segment)
+
+         doc = fitz.open(pdf_path)
+         try:
+             for page_num in range(len(doc)):
+                 page: Page = doc[page_num]
+                 links = page.get_links()
+                 for link in links:
+                     if "page" not in link:
+                         continue
+                     rect = link["from"]
+                     text = page.get_text("text", clip=rect).strip()
+                     if not text:
+                         continue
+                     segments_pair = self._get_link_segments(link, page, segments_by_page)
+                     if not segments_pair:
+                         continue
+                     source, dest = segments_pair
+                     new_link = Link(source_segment=source, destination_segment=dest, text=text)
+                     links_by_source.setdefault(source, []).append(new_link)
+                     links_by_dest.setdefault(dest, []).append(new_link)
+         finally:
+             doc.close()
+
+         return links_by_source, links_by_dest
+
+     def _insert_reference_links(self, segment_text: str, links: list[Link]) -> str:
+         offset = 0
+         for link in links:
+             start_idx = segment_text.find(link.text, offset)
+             if start_idx == -1:
+                 continue
+             escaped_text = link.text.replace("[", "\\[").replace("]", "\\]")
+             md_link = f"[{escaped_text}](#{link.destination_segment.id})"
+             segment_text = segment_text[:start_idx] + md_link + segment_text[start_idx + len(link.text) :]
+             offset = start_idx + len(md_link)
+         return segment_text
+
+     def _process_picture_segment(
+         self,
+         segment: SegmentBox,
+         pdf_images: list[Image],
+         pdf_path: Path,
+         picture_id: int,
+         dpi: int = 72,
+         extracted_images: Optional[list[ExtractedImage]] = None,
+         user_base_name: Optional[str] = None,
+     ) -> str:
+
+         if extracted_images is None:
+             return ""
+
+         segment_box = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+         image = pdf_images[segment.page_number - 1]
+         left, top, right, bottom = segment_box.left, segment_box.top, segment_box.right, segment_box.bottom
+         if dpi != 72:
+             left = left * dpi / 72
+             top = top * dpi / 72
+             right = right * dpi / 72
+             bottom = bottom * dpi / 72
+         cropped = image.crop((left, top, right, bottom))
+
+         base_name = user_base_name if user_base_name else pdf_path.stem
+         image_name = f"{base_name}_{segment.page_number}_{picture_id}.png"
+
+         img_buffer = io.BytesIO()
+         cropped.save(img_buffer, format="PNG")
+         extracted_images.append(ExtractedImage(image_data=img_buffer.getvalue(), filename=image_name))
+         return f"<span id='{segment.id}'></span>\n" + f"<img src='{base_name}_pictures/{image_name}' alt=''>\n\n"
+
+     def _process_table_segment(self, segment: SegmentBox) -> str:
+         return f"<span id='{segment.id}'></span>\n" + segment.text + "\n\n"
+
+     def _get_token_content(self, token: PdfToken) -> str:
+         if self.output_format == OutputFormat.HTML:
+             return token.content_html
+         else:
+             return token.content_markdown
+
+     def _get_styled_content(self, token: PdfToken, content: str) -> str:
+         if self.output_format == OutputFormat.HTML:
+             styled = token.token_style.get_styled_content_html(content)
+             styled = token.token_style.script_type.get_styled_content(styled)
+             styled = token.token_style.list_level.get_styled_content_html(styled)
+             return token.token_style.hyperlink_style.get_styled_content_html(styled)
+         else:
+             styled = token.token_style.get_styled_content_markdown(content)
+             styled = token.token_style.script_type.get_styled_content(styled)
+             styled = token.token_style.list_level.get_styled_content_markdown(styled)
+             return token.token_style.hyperlink_style.get_styled_content_markdown(styled)
+
+     def _process_title_segment(self, tokens: list[PdfToken], segment: SegmentBox) -> str:
+         if not tokens:
+             return ""
+
+         title_type = tokens[0].token_style.title_type
+         content = " ".join([self._get_styled_content(token, token.content) for token in tokens])
+         if self.output_format == OutputFormat.HTML:
+             content = title_type.get_styled_content_html(content)
+         else:
+             content = title_type.get_styled_content_markdown(content)
+         anchor = f"<span id='{segment.id}'></span>\n"
+         return anchor + content + "\n\n"
+
+     def _process_regular_segment(
+         self,
+         tokens: list[PdfToken],
+         segment: SegmentBox,
+         links_by_source: dict[SegmentBox, list[Link]],
+         links_by_dest: dict[SegmentBox, list[Link]],
+     ) -> str:
+         if not tokens:
+             return ""
+         content = " ".join(self._get_token_content(t) for t in tokens)
+         if segment in links_by_source:
+             content = self._insert_reference_links(content, links_by_source[segment])
+         if segment in links_by_dest:
+             content = f"<span id='{segment.id}'></span>\n" + content
+         return content + "\n\n"
+
+     def _get_table_of_contents(self, vgt_segments: list[SegmentBox]) -> str:
+         title_segments = [s for s in vgt_segments if s.type in {TokenType.TITLE, TokenType.SECTION_HEADER}]
+         table_of_contents = "# Table of Contents\n\n"
+         for segment in title_segments:
+             if not segment.text.strip():
+                 continue
+             first_word = segment.text.split()[0]
+             indentation = max(0, first_word.count(".") - 1)
+             content = " " * indentation + "- [" + segment.text + "](#" + segment.id + ")\n"
+             table_of_contents += content
+         table_of_contents += "\n"
+         return table_of_contents + "\n\n"
+
+     def _set_segment_ids(self, vgt_segments: list[SegmentBox]) -> None:
+         segments_by_page: dict[int, list[SegmentBox]] = {}
+         for segment in vgt_segments:
+             segments_by_page.setdefault(segment.page_number, []).append(segment)
+         for page_number, segments in segments_by_page.items():
+             for segment_index, segment in enumerate(segments):
+                 segment.id = f"page-{page_number}-{segment_index}"
+
+     def _generate_content(
+         self,
+         pdf_path: Path,
+         vgt_segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         extracted_images: Optional[list[ExtractedImage]] = None,
+         user_base_name: Optional[str] = None,
+     ) -> str:
+         pdf_labels: PdfLabels = self._create_pdf_labels_from_segments(vgt_segments)
+         pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
+         pdf_features.set_token_types(pdf_labels)
+         pdf_features.set_token_styles()
+
+         self._set_segment_ids(vgt_segments)
+         content_parts: list[str] = []
+         if extract_toc:
+             content_parts.append(self._get_table_of_contents(vgt_segments))
+
+         links_by_source, links_by_dest = self._extract_links_by_segments(pdf_path, vgt_segments)
+
+         picture_segments = [s for s in vgt_segments if s.type == TokenType.PICTURE]
+         pdf_images: list[Image] = convert_from_path(pdf_path, dpi=dpi) if picture_segments else []
+
+         for page in pdf_features.pages:
+             segments_in_page = [s for s in vgt_segments if s.page_number == page.page_number]
+             picture_id = 0
+             for segment in segments_in_page:
+                 seg_box = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+                 tokens_in_seg = [t for t in page.tokens if t.bounding_box.get_intersection_percentage(seg_box) > 50]
+
+                 if segment.type == TokenType.PICTURE:
+                     content_parts.append(
+                         self._process_picture_segment(
+                             segment, pdf_images, pdf_path, picture_id, dpi, extracted_images, user_base_name
+                         )
+                     )
+                     picture_id += 1
+                 elif segment.type == TokenType.TABLE:
+                     content_parts.append(self._process_table_segment(segment))
+                 elif segment.type in {TokenType.TITLE, TokenType.SECTION_HEADER}:
+                     content_parts.append(self._process_title_segment(tokens_in_seg, segment))
+                 elif segment.type == TokenType.FORMULA:
+                     content_parts.append(segment.text + "\n\n")
+                 else:
+                     content_parts.append(
+                         self._process_regular_segment(tokens_in_seg, segment, links_by_source, links_by_dest)
+                     )
+
+         return "".join(content_parts)
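
`_generate_content` assigns a PDF token to a segment when more than 50% of the token's bounding box overlaps the segment, then dispatches on the segment type. A toy illustration of that overlap rule with plain tuples; the `overlap_percentage` function below is illustrative only and is not the library's `get_intersection_percentage`:

```python
# Illustrative only: percentage of box `a` that lies inside box `b`,
# mirroring the "> 50" threshold used when grouping tokens into segments.
def overlap_percentage(a: tuple[float, float, float, float], b: tuple[float, float, float, float]) -> float:
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    ix = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    iy = max(0.0, min(ay2, by2) - max(ay1, by1))
    area_a = max(0.0, (ax2 - ax1) * (ay2 - ay1))
    return 100.0 * ix * iy / area_a if area_a else 0.0


token_box = (10, 10, 30, 20)   # 20 x 10 token
segment_box = (0, 0, 25, 50)   # covers the token's left 15 of 20 units
print(overlap_percentage(token_box, segment_box))  # 75.0 -> token belongs to this segment
```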
src/adapters/infrastructure/ocr/__init__.py ADDED
File without changes
src/adapters/infrastructure/ocr/languages.py ADDED
@@ -0,0 +1,174 @@
+ import subprocess
+
+ iso_to_tesseract = {
+     "af": "afr", # Afrikaans
+     "all": "all", # Allar
+     "am": "amh", # Amharic
+     "ar": "ara", # Arabic
+     "as": "asm", # Assamese
+     "az": "aze", # Azerbaijani
+     "aze-cyrl": "aze-cyrl", # Azerbaijani (Cyrillic)
+     "be": "bel", # Belarusian
+     "bn": "ben", # Bangla
+     "bo": "bod", # Tibetan
+     "bs": "bos", # Bosnian
+     "br": "bre", # Breton
+     "bg": "bul", # Bulgarian
+     "ca": "cat", # Catalan
+     "ceb": "ceb", # Cebuano
+     "cs": "ces", # Czech
+     "zh-Hans": "chi_sim", # Chinese (Simplified)
+     "chi-sim-vert": "chi-sim-vert", # Chinese (Simplified) vertical
+     "zh-Hant": "chi_tra", # Chinese (Traditional)
+     "chi-tra-vert": "chi-tra-vert", # Chinese (Traditional) vertical
+     "chr": "chr", # Cherokee
+     "co": "cos", # Corsican
+     "cy": "cym", # Welsh
+     "da": "dan", # Danish
+     "de": "deu", # German
+     "dv": "div", # Divehi
+     "dz": "dzo", # Dzongkha
+     "el": "ell", # Greek
+     "en": "eng", # English
+     "enm": "enm", # Middle English
+     "eo": "epo", # Esperanto
+     "et": "est", # Estonian
+     "eu": "eus", # Basque
+     "fo": "fao", # Faroese
+     "fa": "fas", # Persian
+     "fil": "fil", # Filipino
+     "fi": "fin", # Finnish
+     "fr": "fra", # French
+     "frk": "frk", # Frankish
+     "frm": "frm", # Middle French
+     "fy": "fry", # Western Frisian
+     "gd": "gla", # Scottish Gaelic
+     "ga": "gle", # Irish
+     "gl": "glg", # Galician
+     "grc": "grc", # Ancient Greek
+     "gu": "guj", # Gujarati
+     "ht": "hat", # Haitian Creole
+     "he": "heb", # Hebrew
+     "hi": "hin", # Hindi
+     "hr": "hrv", # Croatian
+     "hu": "hun", # Hungarian
+     "hy": "hye", # Armenian
+     "iu": "iku", # Inuktitut
+     "id": "ind", # Indonesian
+     "is": "isl", # Icelandic
+     "it": "ita", # Italian
+     "ita-old": "ita-old", # Old Italian
+     "jv": "jav", # Javanese
+     "ja": "jpn", # Japanese
+     "jpn-vert": "jpn-vert", # Japanese vertical
+     "kn": "kan", # Kannada
+     "ka": "kat", # Georgian
+     "kat-old": "kat-old", # Old Georgian
+     "kk": "kaz", # Kazakh
+     "km": "khm", # Khmer
+     "ky": "kir", # Kyrgyz
+     "kmr": "kmr", # Northern Kurdish
+     "ko": "kor", # Korean
+     "kor-vert": "kor_vert", # Korean vertical
+     "lo": "lao", # Lao
+     "la": "lat", # Latin
+     "lv": "lav", # Latvian
+     "lt": "lit", # Lithuanian
+     "lb": "ltz", # Luxembourgish
+     "ml": "mal", # Malayalam
+     "mr": "mar", # Marathi
+     "mk": "mkd", # Macedonian
+     "mt": "mlt", # Maltese
+     "mn": "mon", # Mongolian
+     "mi": "mri", # Māori
+     "ms": "msa", # Malay
+     "my": "mya", # Burmese
+     "ne": "nep", # Nepali
+     "nl": "nld", # Dutch
+     "no": "nor", # Norwegian
+     "oc": "oci", # Occitan
+     "or": "ori", # Odia
+     "osd": "osd", # Unknown language [osd]
+     "pa": "pan", # Punjabi
+     "pl": "pol", # Polish
+     "pt": "por", # Portuguese
+     "ps": "pus", # Pashto
+     "qu": "que", # Quechua
+     "ro": "ron", # Romanian
+     "ru": "rus", # Russian
+     "sa": "san", # Sanskrit
+     "script-arab": "script-arab", # Arabic script
+     "script-armn": "script-armn", # Armenian script
+     "script-beng": "script-beng", # Bengali script
+     "script-cans": "script-cans", # Canadian Aboriginal script
+     "script-cher": "script-cher", # Cherokee script
+     "script-cyrl": "script-cyrl", # Cyrillic script
+     "script-deva": "script-deva", # Devanagari script
+     "script-ethi": "script-ethi", # Ethiopic script
+     "script-frak": "script-frak", # Frankish script
+     "script-geor": "script-geor", # Georgian script
+     "script-grek": "script-grek", # Greek script
+     "script-gujr": "script-gujr", # Gujarati script
+     "script-guru": "script-guru", # Gurmukhi script
+     "script-hang": "script-hang", # Hangul script
+     "script-hang-vert": "script-hang-vert", # Hangul script vertical
+     "script-hans": "script-hans",
+     "script-hans-vert": "script-hans-vert",
+     "script-hant": "script-hant",
+     "script-hant-vert": "script-hant-vert",
+     "script-hebr": "script-hebr", # Hebrew script
+     "script-jpan": "script-jpan", # Japanese script
+     "script-jpan-vert": "script-jpan-vert", # Japanese script vertical
+     "script-khmr": "script-khmr", # Khmer script
+     "script-knda": "script-knda", # Kannada script
+     "script-laoo": "script-laoo", # Lao script
+     "script-latn": "script-latn",
+     "script-mlym": "script-mlym", # Malayalam script
+     "script-mymr": "script-mymr", # Myanmar script
+     "script-orya": "script-orya", # Odia script
+     "script-sinh": "script-sinh", # Sinhala script
+     "script-syrc": "script-syrc", # Syriac script
+     "script-taml": "script-taml", # Tamil script
+     "script-telu": "script-telu", # Telugu script
+     "script-thaa": "script-thaa", # Thaana script
+     "script-thai": "script-thai", # Thai script
+     "script-tibt": "script-tibt", # Tibetan script
+     "script-viet": "script-viet", # Vietnamese script
+     "si": "sin", # Sinhala
+     "sk": "slk", # Slovak
+     "sl": "slv", # Slovenian
+     "sd": "snd", # Sindhi
+     "es": "spa", # Spanish
+     "spa-old": "spa-old", # Old Spanish
+     "sq": "sqi", # Albanian
+     "sr": "srp", # Serbian
+     "srp-latn": "srp-latn", # Serbian (Latin)
+     "su": "sun", # Sundanese
+     "sw": "swa", # Swahili
+     "sv": "swe", # Swedish
+     "syr": "syr", # Syriac
+     "ta": "tam", # Tamil
+     "tt": "tat", # Tatar
+     "te": "tel", # Telugu
+     "tg": "tgk", # Tajik
+     "th": "tha", # Thai
+     "ti": "tir", # Tigrinya
+     "to": "ton", # Tongan
+     "tr": "tur", # Turkish
+     "ug": "uig", # Uyghur
+     "uk": "ukr", # Ukrainian
+     "ur": "urd", # Urdu
+     "uz": "uzb", # Uzbek
+     "uzb-cyrl": "uzb-cyrl", # Uzbek (Cyrillic)
+     "vi": "vie", # Vietnamese
+     "yi": "yid", # Yiddish
+     "yo": "yor", # Yoruba
+ }
+
+
+ def supported_languages():
+     cmd = "tesseract --list-langs | grep -v osd | awk '{if(NR>1)print}'"
+     sp = subprocess.Popen(["/bin/bash", "-c", cmd], stdout=subprocess.PIPE)
+     tesseract_langs = [line.strip().decode("utf-8") for line in sp.stdout.readlines()]
+     inverted_iso_dict = {v: k for k, v in iso_to_tesseract.items()}
+     return list({tesseract_key: inverted_iso_dict[tesseract_key] for tesseract_key in tesseract_langs}.values())
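
`supported_languages()` shells out to `tesseract --list-langs` and maps the installed traineddata names back to ISO codes through the inverted table, so its result depends on which language packs are installed. The forward lookup can be illustrated without a Tesseract install, assuming only the `iso_to_tesseract` table above; the fallback to `eng` is an assumption made for the example, not behaviour of this module:

```python
# Sketch: resolve the Tesseract traineddata name for an ISO code,
# falling back to English when the language is not in the table (assumption).
def tesseract_lang_for(iso_code: str, default: str = "eng") -> str:
    return iso_to_tesseract.get(iso_code, default)


print(tesseract_lang_for("de"))       # deu
print(tesseract_lang_for("zh-Hans"))  # chi_sim
print(tesseract_lang_for("xx"))       # eng (fallback)
```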