Wasim committed on
Commit 2e237ce · Parent: df67c09

Sync: robust vehicle parser + full project

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .dockerignore +5 -0
  2. .gitattributes +8 -8
  3. .github/FUNDING.yml +1 -0
  4. .github/dependabot.yml +17 -0
  5. .github/workflows/push_docker_image.yml +53 -0
  6. .github/workflows/test.yml +49 -0
  7. .gitignore +167 -0
  8. Dockerfile +55 -0
  9. LICENSE +201 -0
  10. Makefile +78 -0
  11. README.md +898 -28
  12. app.py +83 -9
  13. dev-requirements.txt +4 -0
  14. docker-compose-gpu.yml +14 -0
  15. docker-compose.yml +11 -0
  16. extract_pdf_data.py +528 -33
  17. extract_red_text.py +377 -72
  18. fine_tuning_lightgbm_models.ipynb +961 -0
  19. images/vgtexample1.png +3 -0
  20. images/vgtexample2.png +3 -0
  21. images/vgtexample3.png +3 -0
  22. images/vgtexample4.png +3 -0
  23. justfile +95 -0
  24. master_key.py +2 -1
  25. pyproject.toml +39 -0
  26. requirements.txt +27 -0
  27. space-pdf/README.md +910 -0
  28. space-pdf/app.py +124 -0
  29. space-pdf/extract_pdf_data.py +534 -0
  30. space-pdf/extract_red_text.py +764 -0
  31. space-pdf/master_key.py +372 -0
  32. space-pdf/packages.txt +2 -0
  33. space-pdf/requirements.txt +37 -0
  34. space-pdf/update_docx_with_pdf.py +1470 -0
  35. space-pdf/updated_word.py +1189 -0
  36. src/adapters/__init__.py +0 -0
  37. src/adapters/infrastructure/__init__.py +0 -0
  38. src/adapters/infrastructure/format_conversion_service_adapter.py +13 -0
  39. src/adapters/infrastructure/format_converters/__init__.py +0 -0
  40. src/adapters/infrastructure/format_converters/convert_formula_to_latex.py +43 -0
  41. src/adapters/infrastructure/format_converters/convert_table_to_html.py +33 -0
  42. src/adapters/infrastructure/html_conversion_service_adapter.py +23 -0
  43. src/adapters/infrastructure/markdown_conversion_service_adapter.py +23 -0
  44. src/adapters/infrastructure/markup_conversion/ExtractedImage.py +6 -0
  45. src/adapters/infrastructure/markup_conversion/Link.py +8 -0
  46. src/adapters/infrastructure/markup_conversion/OutputFormat.py +6 -0
  47. src/adapters/infrastructure/markup_conversion/__init__.py +0 -0
  48. src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py +361 -0
  49. src/adapters/infrastructure/ocr/__init__.py +0 -0
  50. src/adapters/infrastructure/ocr/languages.py +174 -0
.dockerignore ADDED
@@ -0,0 +1,5 @@
+ /venv/
+ /.venv/
+ .git
+ /detectron2/
+ /images/
.gitattributes CHANGED
@@ -1,8 +1,8 @@
- # Handle Python code and text files
- *.py text eol=lf
- *.md text eol=lf
- *.txt text eol=lf
-
- # Handle binary files
- *.pdf binary
- *.docx binary
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
+ custom: ["https://huridocs.org/donate/"]
.github/dependabot.yml ADDED
@@ -0,0 +1,17 @@
+ version: 2
+ updates:
+   - package-ecosystem: "pip"
+     directory: "/"
+     schedule:
+       interval: "daily"
+     open-pull-requests-limit: 5
+     labels:
+       - "dependencies"
+   - package-ecosystem: "github-actions"
+     directory: "/"
+     schedule:
+       interval: "daily"
+   - package-ecosystem: "docker"
+     directory: "/"
+     schedule:
+       interval: "daily"
.github/workflows/push_docker_image.yml ADDED
@@ -0,0 +1,53 @@
+ name: Create and publish Docker image
+
+ on:
+   push:
+     tags:
+       - 'v*'
+
+ env:
+   REGISTRY: ghcr.io
+   IMAGE_NAME: huridocs/pdf-document-layout-analysis
+
+ jobs:
+   build-and-push-image:
+     runs-on: ubuntu-latest
+     permissions:
+       contents: read
+       packages: write
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+
+       - name: Install dependencies
+         run: sudo apt-get install -y just
+
+       - name: Log in to the Container registry
+         uses: docker/login-action@v3
+         with:
+           registry: ${{ env.REGISTRY }}
+           username: ${{ github.actor }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Extract metadata (tags, labels) for Docker
+         id: meta
+         uses: docker/metadata-action@v5
+         with:
+           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+           tags: |
+             type=ref,event=branch
+             type=ref,event=pr
+             type=semver,pattern={{version}}
+             type=semver,pattern={{major}}.{{minor}}
+
+       - name: Create folder models
+         run: mkdir -p models
+
+       - name: Build and push
+         uses: docker/build-push-action@v6
+         with:
+           context: .
+           file: Dockerfile
+           push: ${{ github.event_name != 'pull_request' }}
+           tags: ${{ steps.meta.outputs.tags }}
+           labels: ${{ steps.meta.outputs.labels }}
.github/workflows/test.yml ADDED
@@ -0,0 +1,49 @@
+ # This workflow will install Python dependencies, run tests and lint with a single version of Python
+ # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+ name: Test
+
+ on:
+   push:
+     branches: [ main ]
+   pull_request:
+     branches: [ main ]
+
+ jobs:
+   build:
+
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+       - name: Set up Python 3.11
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.11'
+
+       - name: Install dependencies
+         run: sudo apt-get update; sudo apt-get install -y pdftohtml qpdf just
+
+       - name: Free up space
+         run: just free_up_space
+
+       - name: Install venv
+         run: just install_venv
+
+       - name: Lint with black
+         run: just check_format
+
+       - name: Start service
+         run: just start_detached
+
+       - name: Check API ready
+         uses: emilioschepis/wait-for-endpoint@v1.0.3
+         with:
+           url: http://localhost:5060
+           method: GET
+           expected-status: 200
+           timeout: 120000
+           interval: 500
+
+       - name: Test with unittest
+         run: just test
.gitignore ADDED
@@ -0,0 +1,167 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+ /models/
162
+ /word_grids/
163
+ /jsons/
164
+ /model_output/
165
+ /pdf_outputs/
166
+ /detectron2/
167
+ /ocr/
Dockerfile ADDED
@@ -0,0 +1,55 @@
+ FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ RUN apt-get update
+ RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc
+
+
+ RUN apt-get install -y ocrmypdf
+ RUN apt-get install -y tesseract-ocr-fra
+ RUN apt-get install -y tesseract-ocr-spa
+ RUN apt-get install -y tesseract-ocr-deu
+ RUN apt-get install -y tesseract-ocr-ara
+ RUN apt-get install -y tesseract-ocr-mya
+ RUN apt-get install -y tesseract-ocr-hin
+ RUN apt-get install -y tesseract-ocr-tam
+ RUN apt-get install -y tesseract-ocr-tha
+ RUN apt-get install -y tesseract-ocr-chi-sim
+ RUN apt-get install -y tesseract-ocr-tur
+ RUN apt-get install -y tesseract-ocr-ukr
+ RUN apt-get install -y tesseract-ocr-ell
+ RUN apt-get install -y tesseract-ocr-rus
+ RUN apt-get install -y tesseract-ocr-kor
+ RUN apt-get install -y tesseract-ocr-kor-vert
+
+
+ RUN mkdir -p /app/src
+ RUN mkdir -p /app/models
+
+ RUN addgroup --system python && adduser --system --group python
+ RUN chown -R python:python /app
+ USER python
+
+ ENV VIRTUAL_ENV=/app/.venv
+ RUN python -m venv $VIRTUAL_ENV
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ COPY requirements.txt requirements.txt
+ RUN uv pip install --upgrade pip
+ RUN uv pip install -r requirements.txt
+
+ WORKDIR /app
+
+ RUN cd src; git clone https://github.com/facebookresearch/detectron2;
+ RUN cd src/detectron2; git checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4; python setup.py build develop
+ RUN uv pip install pycocotools==2.0.8
+
+ COPY ./start.sh ./start.sh
+ COPY ./src/. ./src
+ COPY ./models/. ./models/
+ RUN python src/download_models.py
+
+ ENV PYTHONPATH "${PYTHONPATH}:/app/src"
+ ENV TRANSFORMERS_VERBOSITY=error
+ ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2024-present HURIDOCS
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,78 @@
+ HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)
+
+ install:
+ 	. .venv/bin/activate; pip install -Ur requirements.txt
+
+ activate:
+ 	. .venv/bin/activate
+
+ install_venv:
+ 	python3 -m venv .venv
+ 	. .venv/bin/activate; python -m pip install --upgrade pip
+ 	. .venv/bin/activate; python -m pip install -r dev-requirements.txt
+
+ formatter:
+ 	. .venv/bin/activate; command black --line-length 125 .
+
+ check_format:
+ 	. .venv/bin/activate; command black --line-length 125 . --check
+
+ remove_docker_containers:
+ 	docker compose ps -q | xargs docker rm
+
+ remove_docker_images:
+ 	docker compose config --images | xargs docker rmi
+
+ start:
+ ifeq ($(OS), Windows_NT)
+ 	if not exist models mkdir models
+ else
+ 	mkdir -p ./models
+ endif
+ ifeq ($(HAS_GPU), 1)
+ 	@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
+ 	docker compose -f docker-compose-gpu.yml up --build
+ else
+ 	@echo "No NVIDIA GPU detected, using docker-compose.yml"
+ 	docker compose -f docker-compose.yml up --build
+ endif
+
+
+ start_no_gpu:
+ 	mkdir -p ./models
+ 	docker compose up --build
+
+ stop:
+ 	docker compose stop
+
+ test:
+ 	. .venv/bin/activate; command cd src; command python -m pytest
+
+ free_up_space:
+ 	df -h
+ 	sudo rm -rf /usr/share/dotnet
+ 	sudo rm -rf /opt/ghc
+ 	sudo rm -rf "/usr/local/share/boost"
+ 	sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+ 	sudo apt-get remove -y '^llvm-.*' || true
+ 	sudo apt-get remove -y 'php.*' || true
+ 	sudo apt-get remove -y google-cloud-sdk hhvm google-chrome-stable firefox mono-devel || true
+ 	sudo apt-get autoremove -y
+ 	sudo apt-get clean
+ 	sudo rm -rf /usr/share/dotnet
+ 	sudo rm -rf /usr/local/lib/android
+ 	sudo rm -rf /opt/hostedtoolcache/CodeQL
+ 	sudo docker image prune --all --force
+ 	df -h
+
+
+ start_detached:
+ 	mkdir -p ./models
+ 	docker compose up --build -d
+
+ start_detached_gpu:
+ 	mkdir -p ./models
+ 	RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
+
+ upgrade:
+ 	. .venv/bin/activate; pip-upgrade
README.md CHANGED
@@ -1,40 +1,910 @@
  ---
- title: Audit Report Generator
- emoji: 📝
- colorFrom: purple
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.38.2
- app_file: app.py
- pinned: false
  ---

- # NHVAS Audit Report Generator

- This tool automatically extracts relevant fields from an NHVAS PDF audit summary and populates a Word report template with the extracted data.

- ## Features

- - Upload an NHVAS PDF report
- - Upload your Word `.docx` report template
- - Automatically fills red-text placeholders in the Word document
- - Supports 7 module combinations (Mass, Maintenance, Fatigue, and their permutations)
- - Download the completed report instantly

- ## How to Use

- 1. Upload your **PDF audit report**.
- 2. Upload your **Word template (.docx)** with red-text placeholders.
- 3. Click **Generate Report**.
- 4. Download the updated Word document.

- ## Tech Stack

- - Python 🐍
- - Gradio UI (via Hugging Face Spaces)
- - PyMuPDF (for PDF parsing)
- - python-docx (for Word file editing)

- ## Author

- Built by Shami (Muhammad Ahtesham Ahmad)
1
+ <h1 align="center">PDF Document Layout Analysis</h1>
2
+ <p align="center">A Docker-powered microservice for intelligent PDF document layout analysis, OCR, and content extraction</p>
3
+
4
+ <p align="center">
5
+ <img src="https://img.shields.io/badge/Python-3.10+-blue.svg" alt="Python Version">
6
+ <img src="https://img.shields.io/badge/FastAPI-0.111.1-green.svg" alt="FastAPI">
7
+ <img src="https://img.shields.io/badge/Docker-Ready-blue.svg" alt="Docker">
8
+ <img src="https://img.shields.io/badge/GPU-Supported-orange.svg" alt="GPU Support">
9
+ </p>
10
+
11
+
12
+ <div align="center">
13
+ <p><strong>Built with ❤️ by <a href="https://huridocs.org">HURIDOCS</a></strong></p>
14
+ <p>
15
+ <a href="https://github.com/huridocs/pdf-document-layout-analysis">⭐ Star us on GitHub</a> •
16
+ <a href="https://hub.docker.com/r/huridocs/pdf-document-layout-analysis">🐳 Pull from Docker Hub</a> •
17
+ <a href="https://huggingface.co/HURIDOCS/pdf-document-layout-analysis">🤗 View on Hugging Face</a>
18
+ </p>
19
+ </div>
20
+
21
+
22
+
23
  ---
24
+
25
+ ## 🚀 Overview
26
+
27
+ This project provides a powerful and flexible PDF analysis microservice built with **Clean Architecture** principles. The service enables OCR, segmentation, and classification of different parts of PDF pages, identifying elements such as texts, titles, pictures, tables, formulas, and more. Additionally, it determines the correct reading order of these identified elements and can convert PDFs to various formats including Markdown and HTML.
28
+
29
+ ### ✨ Key Features
30
+
31
+ - 🔍 **Advanced PDF Layout Analysis** - Segment and classify PDF content with high accuracy
32
+ - 🖼️ **Visual & Fast Models** - Choose between VGT (Vision Grid Transformer) for accuracy or LightGBM for speed
33
+ - 📝 **Multi-format Output** - Export to JSON, Markdown, HTML, and visualize PDF segmentations
34
+ - 🌐 **OCR Support** - 150+ language support with Tesseract OCR
35
+ - 📊 **Table & Formula Extraction** - Extract tables as HTML and formulas as LaTeX
36
+ - 🏗️ **Clean Architecture** - Modular, testable, and maintainable codebase
37
+ - 🐳 **Docker-Ready** - Easy deployment with GPU support
38
+ - ⚡ **RESTful API** - Comprehensive API with 10+ endpoints
39
+
40
+ <table>
41
+ <tr>
42
+ <td>
43
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample1.png"/>
44
+ </td>
45
+ <td>
46
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample2.png"/>
47
+ </td>
48
+ <td>
49
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample3.png"/>
50
+ </td>
51
+ <td>
52
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample4.png"/>
53
+ </td>
54
+ </tr>
55
+ </table>
56
+
57
+ ### 🔗 Project Links
58
+
59
+ - **GitHub**: [pdf-document-layout-analysis](https://github.com/huridocs/pdf-document-layout-analysis)
60
+ - **HuggingFace**: [pdf-document-layout-analysis](https://huggingface.co/HURIDOCS/pdf-document-layout-analysis)
61
+ - **DockerHub**: [pdf-document-layout-analysis](https://hub.docker.com/r/huridocs/pdf-document-layout-analysis/)
62
+
63
  ---
64
 
65
+ ## 🚀 Quick Start
66
+
67
+ ### 1. Start the Service
68
+
69
+ **With GPU support (recommended for better performance):**
70
+ ```bash
71
+ make start
72
+ ```
73
+
74
+ **Without GPU support:**
75
+ ```bash
76
+ make start_no_gpu
77
+ ```
78
+
79
+ The service will be available at `http://localhost:5060`
80
+
81
+ **Check service status:**
82
+
83
+ ```bash
84
+ curl http://localhost:5060/info
85
+ ```
86
+
87
+ ### 2. Basic PDF Analysis
88
+
89
+ **Analyze a PDF document (VGT model - high accuracy):**
90
+ ```bash
91
+ curl -X POST -F 'file=@/path/to/your/document.pdf' http://localhost:5060
92
+ ```
93
+
94
+ **Fast analysis (LightGBM models - faster processing):**
95
+ ```bash
96
+ curl -X POST -F 'file=@/path/to/your/document.pdf' -F "fast=true" http://localhost:5060
97
+ ```
98
+
99
+ ### 3. Stop the Service
100
+
101
+ ```bash
102
+ make stop
103
+ ```
104
+
105
+ > 💡 **Tip**: Replace `/path/to/your/document.pdf` with the actual path to your PDF file. The service will return a JSON response with segmented content and metadata.
106
+
107
+
108
+ ## 📋 Table of Contents
109
+
110
+ - [🚀 Quick Start](#-quick-start)
111
+ - [⚙️ Dependencies](#-dependencies)
112
+ - [📋 Requirements](#-requirements)
113
+ - [📚 API Reference](#-api-reference)
114
+ - [💡 Usage Examples](#-usage-examples)
115
+ - [🏗️ Architecture](#-architecture)
116
+ - [🤖 Models](#-models)
117
+ - [📊 Data](#-data)
118
+ - [🔧 Development](#-development)
119
+ - [📈 Benchmarks](#-benchmarks)
120
+ - [Performance](#performance)
121
+ - [Speed](#speed)
122
+ - [🌐 Installation of More Languages for OCR](#-installation-of-more-languages-for-ocr)
123
+ - [🔗 Related Services](#-related-services)
124
+ - [🤝 Contributing](#-contributing)
125
+
126
+
127
+
128
+ ## ⚙️ Dependencies
129
+
130
+ ### Required
131
+ - **Docker Desktop 4.25.0+** - [Installation Guide](https://www.docker.com/products/docker-desktop/)
132
+ - **Python 3.10+** (for local development)
133
+
134
+ ### Optional
135
+ - **NVIDIA Container Toolkit** - [Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (for GPU support)
136
+
137
+ ## 📋 Requirements
138
+
139
+ ### System Requirements
140
+ - **RAM**: 2 GB minimum
141
+ - **GPU Memory**: 5 GB (optional, will fallback to CPU if unavailable)
142
+ - **Disk Space**: 10 GB for models and dependencies
143
+ - **CPU**: Multi-core recommended for better performance
144
+
145
+ ### Docker Requirements
146
+ - Docker Engine 20.10+
147
+ - Docker Compose 2.0+
148
+
149
+ ## 📚 API Reference
150
+
151
+ The service provides a comprehensive RESTful API with the following endpoints:
152
+
153
+ ### Core Analysis Endpoints
154
+
155
+ | Endpoint | Method | Description | Parameters |
156
+ |----------|--------|-------------|------------|
157
+ | `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `parse_tables_and_math` |
158
+ | `/save_xml/{filename}` | POST | Analyze PDF and save XML output | `file`, `xml_file_name`, `fast` |
159
+ | `/get_xml/{filename}` | GET | Retrieve saved XML analysis | `xml_file_name` |
160
+
161
+ ### Content Extraction Endpoints
162
+
163
+ | Endpoint | Method | Description | Parameters |
164
+ |----------|--------|-------------|------------|
165
+ | `/text` | POST | Extract text by content types | `file`, `fast`, `types` |
166
+ | `/toc` | POST | Extract table of contents | `file`, `fast` |
167
+ | `/toc_legacy_uwazi_compatible` | POST | Extract TOC (Uwazi compatible) | `file` |
168
+
169
+ ### Format Conversion Endpoints
170
+
171
+ | Endpoint | Method | Description | Parameters |
172
+ |----------|--------|-------------|------------|
173
+ | `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
174
+ | `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
175
+ | `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
176
+
177
+ ### OCR & Utility Endpoints
178
+
179
+ | Endpoint | Method | Description | Parameters |
180
+ |----------|--------|-------------|------------|
181
+ | `/ocr` | POST | Apply OCR to PDF | `file`, `language` |
182
+ | `/info` | GET | Get service information | - |
183
+ | `/` | GET | Health check and system info | - |
184
+ | `/error` | GET | Test error handling | - |
185
+
186
+ ### Common Parameters
187
+
188
+ - **`file`**: PDF file to process (multipart/form-data)
189
+ - **`fast`**: Use LightGBM models instead of VGT (boolean, default: false)
190
+ - **`parse_tables_and_math`**: Apply OCR to table regions (boolean, default: false) and convert formulas to LaTeX
191
+ - **`language`**: OCR language code (string, default: "en")
192
+ - **`types`**: Comma-separated content types to extract (string, default: "all")
193
+ - **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
194
+ - **`dpi`**: Image resolution for conversion (integer, default: 120)
195
+
196
+ ## 💡 Usage Examples
197
+
198
+ ### Basic PDF Analysis
199
+
200
+ **Standard analysis with VGT model:**
201
+ ```bash
202
+ curl -X POST \
203
+ -F 'file=@document.pdf' \
204
+ http://localhost:5060
205
+ ```
206
+
207
+ **Fast analysis with LightGBM models:**
208
+ ```bash
209
+ curl -X POST \
210
+ -F 'file=@document.pdf' \
211
+ -F 'fast=true' \
212
+ http://localhost:5060
213
+ ```
214
+
215
+ **Analysis with table and math parsing:**
216
+ ```bash
217
+ curl -X POST \
218
+ -F 'file=@document.pdf' \
219
+ -F 'parse_tables_and_math=true' \
220
+ http://localhost:5060
221
+ ```
222
+
223
+ ### Text Extraction
224
+
225
+ **Extract all text:**
226
+ ```bash
227
+ curl -X POST \
228
+ -F 'file=@document.pdf' \
229
+ -F 'types=all' \
230
+ http://localhost:5060/text
231
+ ```
232
+
233
+ **Extract specific content types:**
234
+ ```bash
235
+ curl -X POST \
236
+ -F 'file=@document.pdf' \
237
+ -F 'types=title,text,table' \
238
+ http://localhost:5060/text
239
+ ```
240
+
241
+ ### Format Conversion
242
+
243
+ **Convert to Markdown:**
244
+ ```bash
245
+ curl -X POST http://localhost:5060/markdown \
246
+ -F 'file=@document.pdf' \
247
+ -F 'extract_toc=true' \
248
+ -F 'output_file=document.md' \
249
+ --output 'document.zip'
250
+ ```
251
+
252
+ **Convert to HTML:**
253
+ ```bash
254
+ curl -X POST http://localhost:5060/html \
255
+ -F 'file=@document.pdf' \
256
+ -F 'extract_toc=true' \
257
+ -F 'output_file=document.html' \
258
+ --output 'document.zip'
259
+ ```
260
+
261
+ > **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
262
+ > - **Coordinates**: `left`, `top`, `width`, `height`
263
+ > - **Page information**: `page_number`, `page_width`, `page_height`
264
+ > - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
265
+
266
+
267
+ ### OCR Processing
268
+
269
+ **OCR in English:**
270
+ ```bash
271
+ curl -X POST \
272
+ -F 'file=@scanned_document.pdf' \
273
+ -F 'language=en' \
274
+ http://localhost:5060/ocr \
275
+ --output ocr_processed.pdf
276
+ ```
277
+
278
+ **OCR in other languages:**
279
+ ```bash
280
+ # French
281
+ curl -X POST \
282
+ -F 'file=@document_french.pdf' \
283
+ -F 'language=fr' \
284
+ http://localhost:5060/ocr \
285
+ --output ocr_french.pdf
286
+
287
+ # Spanish
288
+ curl -X POST \
289
+ -F 'file=@document_spanish.pdf' \
290
+ -F 'language=es' \
291
+ http://localhost:5060/ocr \
292
+ --output ocr_spanish.pdf
293
+ ```
294
+
295
+ ### Visualization
296
+
297
+ **Generate visualization PDF:**
298
+ ```bash
299
+ curl -X POST \
300
+ -F 'file=@document.pdf' \
301
+ http://localhost:5060/visualize \
302
+ --output visualization.pdf
303
+ ```
304
+
305
+ ### Table of Contents Extraction
306
+
307
+ **Extract structured TOC:**
308
+ ```bash
309
+ curl -X POST \
310
+ -F 'file=@document.pdf' \
311
+ http://localhost:5060/toc
312
+ ```
313
+
314
+ ### XML Storage and Retrieval
315
+
316
+ **Analyze and save XML:**
317
+ ```bash
318
+ curl -X POST \
319
+ -F 'file=@document.pdf' \
320
+ http://localhost:5060/save_xml/my_analysis
321
+ ```
322
+
323
+ **Retrieve saved XML:**
324
+ ```bash
325
+ curl http://localhost:5060/get_xml/my_analysis.xml
326
+ ```
327
+
328
+ ### Service Information
329
+
330
+ **Get service info and supported languages:**
331
+ ```bash
332
+ curl http://localhost:5060/info
333
+ ```
334
+
335
+ **Health check:**
336
+ ```bash
337
+ curl http://localhost:5060/
338
+ ```
339
+
340
+ ### Response Format
341
+
342
+ Most endpoints return JSON with segment information:
343
+
344
+ ```json
345
+ [
346
+ {
347
+ "left": 72.0,
348
+ "top": 84.0,
349
+ "width": 451.2,
350
+ "height": 23.04,
351
+ "page_number": 1,
352
+ "page_width": 595.32,
353
+ "page_height": 841.92,
354
+ "text": "Document Title",
355
+ "type": "Title"
356
+ },
357
+ {
358
+ "left": 72.0,
359
+ "top": 120.0,
360
+ "width": 451.2,
361
+ "height": 200.0,
362
+ "page_number": 1,
363
+ "page_width": 595.32,
364
+ "page_height": 841.92,
365
+ "text": "This is the main text content...",
366
+ "type": "Text"
367
+ }
368
+ ]
369
+ ```
370
+
371
+ ### Supported Content Types
372
+
373
+ - `Caption` - Image and table captions
374
+ - `Footnote` - Footnote text
375
+ - `Formula` - Mathematical formulas
376
+ - `List item` - List items and bullet points
377
+ - `Page footer` - Footer content
378
+ - `Page header` - Header content
379
+ - `Picture` - Images and figures
380
+ - `Section header` - Section headings
381
+ - `Table` - Table content
382
+ - `Text` - Regular text paragraphs
383
+ - `Title` - Document and section titles
384
+
385
+
386
+ ## 🏗️ Architecture
387
+
388
+ This project follows **Clean Architecture** principles, ensuring separation of concerns, testability, and maintainability. The codebase is organized into distinct layers:
389
+
390
+ ### Directory Structure
391
+
392
+ ```
393
+ src/
394
+ ├── domain/ # Enterprise Business Rules
395
+ │ ├── PdfImages.py # PDF image handling domain logic
396
+ │ ├── PdfSegment.py # PDF segment entity
397
+ │ ├── Prediction.py # ML prediction entity
398
+ │ └── SegmentBox.py # Core segment box entity
399
+ ├── use_cases/ # Application Business Rules
400
+ │ ├── pdf_analysis/ # PDF analysis use case
401
+ │ ├── text_extraction/ # Text extraction use case
402
+ │ ├── toc_extraction/ # Table of contents extraction
403
+ │ ├── visualization/ # PDF visualization use case
404
+ │ ├── ocr/ # OCR processing use case
405
+ │ ├── markdown_conversion/ # Markdown conversion use case
406
+ │ └── html_conversion/ # HTML conversion use case
407
+ ├── adapters/ # Interface Adapters
408
+ │ ├── infrastructure/ # External service adapters
409
+ │ ├── ml/ # Machine learning model adapters
410
+ │ ├── storage/ # File storage adapters
411
+ │ └── web/ # Web framework adapters
412
+ ├── ports/ # Interface definitions
413
+ │ ├── services/ # Service interfaces
414
+ │ └── repositories/ # Repository interfaces
415
+ └── drivers/ # Frameworks & Drivers
416
+ └── web/ # FastAPI application setup
417
+ ```
418
+
419
+ ### Layer Responsibilities
420
+
421
+ - **Domain Layer**: Contains core business entities and rules independent of external concerns
422
+ - **Use Cases Layer**: Orchestrates domain entities to fulfill specific application requirements
423
+ - **Adapters Layer**: Implements interfaces defined by inner layers and adapts external frameworks
424
+ - **Drivers Layer**: Contains frameworks, databases, and external agency configurations
425
+
426
+ ### Key Benefits
427
+
428
+ - 🔄 **Dependency Inversion**: High-level modules don't depend on low-level modules
429
+ - 🧪 **Testability**: Easy to unit test business logic in isolation
430
+ - 🔧 **Maintainability**: Changes to external frameworks don't affect business rules
431
+ - 📈 **Scalability**: Easy to add new features without modifying existing code
432
+
433
+
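To make the layer boundaries more concrete, below is a deliberately simplified, hypothetical sketch of how a domain entity, a port, an adapter, and a use case relate in this kind of structure. The names are illustrative only (apart from `SegmentBox`, whose fields mirror the JSON response shown earlier) and do not correspond line-for-line to the actual source files.

```python
# Hypothetical sketch of the ports-and-adapters layering; class and function
# names are illustrative and are not taken verbatim from the codebase.
from abc import ABC, abstractmethod
from dataclasses import dataclass


@dataclass
class SegmentBox:  # domain entity (src/domain/) — fields mirror the JSON response above
    left: float
    top: float
    width: float
    height: float
    page_number: int
    text: str
    type: str


class LayoutAnalysisPort(ABC):  # interface definition (src/ports/)
    @abstractmethod
    def analyze(self, pdf_path: str) -> list[SegmentBox]: ...


class FastLayoutAnalysisAdapter(LayoutAnalysisPort):  # adapter (src/adapters/)
    def analyze(self, pdf_path: str) -> list[SegmentBox]:
        # A real adapter would run the LightGBM or VGT pipeline here.
        raise NotImplementedError


def extract_titles(analyzer: LayoutAnalysisPort, pdf_path: str) -> list[str]:
    # Use case (src/use_cases/): depends only on the port, never on a concrete adapter.
    return [segment.text for segment in analyzer.analyze(pdf_path) if segment.type == "Title"]
```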
434
+ ## 🤖 Models
435
+
436
+ The service offers two complementary model approaches, each optimized for different use cases:
437
+
438
+ ### 1. Vision Grid Transformer (VGT) - High Accuracy Model
439
+
440
+ **Overview**: A state-of-the-art visual model developed by Alibaba Research Group that "sees" the entire page layout.
441
+
442
+ **Key Features**:
443
+ - 🎯 **High Accuracy**: Best-in-class performance on document layout analysis
444
+ - 👁️ **Visual Understanding**: Analyzes the entire page context including spatial relationships
445
+ - 📊 **Trained on DocLayNet**: Uses the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet)
446
+ - 🔬 **Research-Backed**: Based on [Advanced Literate Machinery](https://github.com/AlibabaResearch/AdvancedLiterateMachinery)
447
+
448
+ **Resource Requirements**:
449
+ - GPU: 5GB+ VRAM (recommended)
450
+ - CPU: Falls back automatically if GPU unavailable
451
+ - Processing Speed: ~1.75 seconds/page (GPU [GTX 1070]) or ~13.5 seconds/page (CPU [i7-8700])
452
+
453
+ ### 2. LightGBM Models - Fast & Efficient
454
+
455
+ **Overview**: Lightweight ensemble of two specialized models using XML-based features from Poppler.
456
+
457
+ **Key Features**:
458
+ - ⚡ **High Speed**: ~0.42 seconds per page on CPU (i7-8700)
459
+ - 💾 **Low Resource Usage**: CPU-only, minimal memory footprint
460
+ - 🔄 **Dual Model Approach**:
461
+ - **Token Type Classifier**: Identifies content types (title, text, table, etc.)
462
+ - **Segmentation Model**: Determines proper content boundaries
463
+ - 📄 **XML-Based**: Uses Poppler's PDF-to-XML conversion for feature extraction
464
+
465
+ **Trade-offs**:
466
+ - Slightly lower accuracy compared to VGT
467
+ - No visual context understanding
468
+ - Excellent for batch processing and resource-constrained environments
469
+
470
+ ### OCR Integration
471
+
472
+ Both models integrate seamlessly with OCR capabilities:
473
+
474
+ - **Engine**: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
475
+ - **Processing**: [ocrmypdf](https://ocrmypdf.readthedocs.io/en/latest/index.html)
476
+ - **Languages**: 150+ supported languages
477
+ - **Output**: Searchable PDFs with preserved layout
478
+
479
+ ### Model Selection Guide
480
+
481
+ | Use Case | Recommended Model | Reason |
482
+ |----------|------------------|---------|
483
+ | High accuracy requirements | VGT | Superior visual understanding |
484
+ | Batch processing | LightGBM | Faster processing, lower resources |
485
+ | GPU available | VGT | Leverages GPU acceleration |
486
+ | CPU-only environment | LightGBM | Optimized for CPU processing |
487
+ | Real-time applications | LightGBM | Consistent fast response times |
488
+ | Research/analysis | VGT | Best accuracy for detailed analysis |
489
+
490
+ ## 📊 Data
491
 
492
+ ### Training Dataset
493
 
494
+ Both model types are trained on the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet), a large-scale document layout analysis dataset containing over 80,000 document pages.
495
 
496
+ ### Document Categories
 
 
 
 
497
 
498
+ The models can identify and classify 11 distinct content types:
499
 
500
+ | ID | Category | Description |
501
+ |----|----------|-------------|
502
+ | 1 | **Caption** | Image and table captions |
503
+ | 2 | **Footnote** | Footnote references and text |
504
+ | 3 | **Formula** | Mathematical equations and formulas |
505
+ | 4 | **List item** | Bulleted and numbered list items |
506
+ | 5 | **Page footer** | Footer content and page numbers |
507
+ | 6 | **Page header** | Header content and titles |
508
+ | 7 | **Picture** | Images, figures, and graphics |
509
+ | 8 | **Section header** | Section and subsection headings |
510
+ | 9 | **Table** | Tabular data and structures |
511
+ | 10 | **Text** | Regular paragraph text |
512
+ | 11 | **Title** | Document and chapter titles |
513
 
514
+ ### Dataset Characteristics
515
 
516
+ - **Domain Coverage**: Academic papers, technical documents, reports
517
+ - **Language**: Primarily English with multilingual support
518
+ - **Quality**: High-quality annotations with bounding boxes and labels
519
+ - **Diversity**: Various document layouts, fonts, and formatting styles
520
+
521
+ For detailed information about the dataset, visit the [DocLayNet repository](https://github.com/DS4SD/DocLayNet).
522
+
523
+ ## 🔧 Development
524
+
525
+ ### Local Development Setup
526
+
527
+ 1. **Clone the repository:**
528
+ ```bash
529
+ git clone https://github.com/huridocs/pdf-document-layout-analysis.git
530
+ cd pdf-document-layout-analysis
531
+ ```
532
+
533
+ 2. **Create virtual environment:**
534
+ ```bash
535
+ make install_venv
536
+ ```
537
+
538
+ 3. **Activate environment:**
539
+ ```bash
540
+ make activate
541
+ # or manually: source .venv/bin/activate
542
+ ```
543
+
544
+ 4. **Install dependencies:**
545
+ ```bash
546
+ make install
547
+ ```
548
+
549
+ ### Code Quality
550
+
551
+ **Format code:**
552
+ ```bash
553
+ make formatter
554
+ ```
555
+
556
+ **Check formatting:**
557
+ ```bash
558
+ make check_format
559
+ ```
560
+
561
+ ### Testing
562
+
563
+ **Run tests:**
564
+ ```bash
565
+ make test
566
+ ```
567
+
568
+ **Integration tests:**
569
+ ```bash
570
+ # Tests are located in src/tests/integration/
571
+ python -m pytest src/tests/integration/test_end_to_end.py
572
+ ```
573
+
574
+ ### Docker Development
575
+
576
+ **Build and start (detached mode):**
577
+ ```bash
578
+ # With GPU
579
+ make start_detached_gpu
580
+
581
+ # Without GPU
582
+ make start_detached
583
+ ```
584
+
585
+ **Clean up Docker resources:**
586
+ ```bash
587
+ # Remove containers
588
+ make remove_docker_containers
589
+
590
+ # Remove images
591
+ make remove_docker_images
592
+ ```
593
+
594
+ ### Project Structure
595
+
596
+ ```
597
+ pdf-document-layout-analysis/
598
+ ├── src/ # Source code
599
+ │ ├── domain/ # Business entities
600
+ │ ├── use_cases/ # Application logic
601
+ │ ├── adapters/ # External integrations
602
+ │ ├── ports/ # Interface definitions
603
+ │ └── drivers/ # Framework configurations
604
+ ├── test_pdfs/ # Test PDF files
605
+ ├── models/ # ML model storage
606
+ ├── docker-compose.yml # Docker configuration
607
+ ├── Dockerfile # Container definition
608
+ ├── Makefile # Development commands
609
+ ├── pyproject.toml # Python project configuration
610
+ └── requirements.txt # Python dependencies
611
+ ```
612
+
613
+ ### Environment Variables
614
+
615
+ Key configuration options:
616
+
617
+ ```bash
618
+ # OCR configuration
619
+ OCR_SOURCE=/tmp/ocr_source
620
+
621
+ # Model paths (auto-configured)
622
+ MODELS_PATH=./models
623
+
624
+ # Service configuration
625
+ HOST=0.0.0.0
626
+ PORT=5060
627
+ ```
628
+
629
+ ### Adding New Features
630
+
631
+ 1. **Domain Logic**: Add entities in `src/domain/`
632
+ 2. **Use Cases**: Implement business logic in `src/use_cases/`
633
+ 3. **Adapters**: Create integrations in `src/adapters/`
634
+ 4. **Ports**: Define interfaces in `src/ports/`
635
+ 5. **Controllers**: Add endpoints in `src/adapters/web/`
636
+
637
+ ### Debugging
638
+
639
+ **View logs:**
640
+ ```bash
641
+ docker compose logs -f
642
+ ```
643
+
644
+ **Access container:**
645
+ ```bash
646
+ docker exec -it pdf-document-layout-analysis /bin/bash
647
+ ```
648
+
649
+ **Free up disk space:**
650
+ ```bash
651
+ make free_up_space
652
+ ```
653
+
654
+ ### Order of Output Elements
655
+
656
+ The service returns SegmentBox elements in a carefully determined reading order:
657
+
658
+ #### Reading Order Algorithm
659
+
660
+ 1. **Poppler Integration**: Uses [Poppler](https://poppler.freedesktop.org) PDF-to-XML conversion to establish initial token reading order
661
+ 2. **Segment Averaging**: Calculates average reading order for multi-token segments
662
+ 3. **Type-Based Sorting**: Prioritizes content types:
663
+ - **Headers** placed first
664
+ - **Main content** in reading order
665
+ - **Footers and footnotes** placed last
666
+
667
+ #### Non-Text Elements
668
+
669
+ For segments without text (e.g., images):
670
+ - Processed after text-based sorting
671
+ - Positioned based on nearest text segment proximity
672
+ - Uses spatial distance as the primary criterion
673
+
674
+ ### Advanced Table and Formula Extraction
675
+
676
+ #### Default Behavior
677
+ - **Formulas**: Automatically extracted as LaTeX format in the `text` property
678
+ - **Tables**: Basic text extraction included by default
679
+
680
+ #### Enhanced Table Extraction
681
+
682
+ Parse tables and extract them in HTML format by setting `parse_tables_and_math=true`:
683
+
684
+ ```bash
685
+ curl -X POST -F 'file=@document.pdf' -F 'parse_tables_and_math=true' http://localhost:5060
686
+ ```
687
+
688
+
689
+ #### Extraction Engines
690
+ - **Formulas**: [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
691
+ - **Tables**: [RapidTable](https://github.com/RapidAI/RapidTable)
692
+
693
+
694
+ ## 📈 Benchmarks
695
+
696
+ ### Performance
697
+
698
+ VGT model performance on PubLayNet dataset:
699
+
700
+ | Metric | Overall | Text | Title | List | Table | Figure |
701
+ |--------|---------|------|-------|------|-------|--------|
702
+ | **F1 Score** | **0.962** | 0.950 | 0.939 | 0.968 | 0.981 | 0.971 |
703
+
704
+ > 📊 **Comparison**: View comprehensive model comparisons at [Papers With Code](https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val)
705
+
706
+ ### Speed
707
+
708
+ Performance benchmarks on 15-page academic documents:
709
+
710
+ | Model | Hardware | Speed (sec/page) | Use Case |
711
+ |-------|----------|------------------|----------|
712
+ | **LightGBM** | CPU (i7-8700 3.2GHz) | **0.42** | Fast processing |
713
+ | **VGT** | GPU (GTX 1070) | **1.75** | High accuracy |
714
+ | **VGT** | CPU (i7-8700 3.2GHz) | 13.5 | CPU fallback |
715
+
716
+ ### Performance Recommendations
717
+
718
+ - **GPU Available**: Use VGT for best accuracy-speed balance
719
+ - **CPU Only**: Use LightGBM for optimal performance
720
+ - **Batch Processing**: LightGBM for consistent throughput
721
+ - **High Accuracy**: VGT with GPU for best results
722
+
723
+
724
+ ## 🌐 Installation of More Languages for OCR
725
+
726
+ The service uses Tesseract OCR with support for 150+ languages. The Docker image includes only common languages to minimize image size.
727
+
728
+ ### Installing Additional Languages
729
+
730
+ #### 1. Access the Container
731
+ ```bash
732
+ docker exec -it --user root pdf-document-layout-analysis /bin/bash
733
+ ```
734
+
735
+ #### 2. Install Language Packs
736
+ ```bash
737
+ # Install specific language
738
+ apt-get update
739
+ apt-get install tesseract-ocr-[LANGCODE]
740
+ ```
741
+
742
+ #### 3. Common Language Examples
743
+
744
+ ```bash
745
+ # Korean
746
+ apt-get install tesseract-ocr-kor
747
+
748
+ # German
749
+ apt-get install tesseract-ocr-deu
750
+
751
+ # French
752
+ apt-get install tesseract-ocr-fra
753
+
754
+ # Spanish
755
+ apt-get install tesseract-ocr-spa
756
+
757
+ # Chinese Simplified
758
+ apt-get install tesseract-ocr-chi-sim
759
+
760
+ # Arabic
761
+ apt-get install tesseract-ocr-ara
762
+
763
+ # Japanese
764
+ apt-get install tesseract-ocr-jpn
765
+ ```
766
+
767
+ #### 4. Verify Installation
768
+
769
+ ```bash
770
+ curl http://localhost:5060/info
771
+ ```
772
+
773
+ ### Language Code Reference
774
+
775
+ Find Tesseract language codes in the [ISO to Tesseract mapping](https://github.com/huridocs/pdf-document-layout-analysis/blob/main/src/adapters/infrastructure/ocr/languages.py).
776
+
777
+ ### Supported Languages
778
+
779
+ Common language codes:
780
+ - `eng` - English
781
+ - `fra` - French
782
+ - `deu` - German
783
+ - `spa` - Spanish
784
+ - `ita` - Italian
785
+ - `por` - Portuguese
786
+ - `rus` - Russian
787
+ - `chi-sim` - Chinese Simplified
788
+ - `chi-tra` - Chinese Traditional
789
+ - `jpn` - Japanese
790
+ - `kor` - Korean
791
+ - `ara` - Arabic
792
+ - `hin` - Hindi
793
+
794
+ ### Usage with Multiple Languages
795
+
796
+ ```bash
797
+ # OCR with specific language
798
+ curl -X POST \
799
+ -F 'file=@document.pdf' \
800
+ -F 'language=fr' \
801
+ http://localhost:5060/ocr \
802
+ --output french_ocr.pdf
803
+ ```
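+
+ The equivalent call from Python (a sketch with the `requests` library; the `/ocr` route and `language` field mirror the curl command above):
+
+ ```python
+ import requests
+
+ # Request a searchable (OCRed) PDF in French and save it to disk.
+ with open("document.pdf", "rb") as pdf:
+     response = requests.post(
+         "http://localhost:5060/ocr",
+         files={"file": pdf},
+         data={"language": "fr"},
+         timeout=600,
+     )
+ response.raise_for_status()
+ with open("french_ocr.pdf", "wb") as out:
+     out.write(response.content)
+ ```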
804
+
805
+
806
+ ## 🔗 Related Services
807
+
808
+ Explore our ecosystem of PDF processing services built on this foundation:
809
+
810
+ ### [PDF Table of Contents Extractor](https://github.com/huridocs/pdf-table-of-contents-extractor)
811
+ 🔍 **Purpose**: Intelligent extraction of structured table of contents from PDF documents
812
+
813
+ **Key Features**:
814
+ - Leverages layout analysis for accurate TOC identification
815
+ - Hierarchical structure recognition
816
+ - Multiple output formats supported
817
+ - Integration-ready API
818
+
819
+ ### [PDF Text Extraction](https://github.com/huridocs/pdf-text-extraction)
820
+ 📝 **Purpose**: Advanced text extraction with layout awareness
821
+
822
+ **Key Features**:
823
+ - Content-type aware extraction
824
+ - Preserves document structure
825
+ - Reading order optimization
826
+ - Clean text output with metadata
827
+
828
+ ### Integration Benefits
829
+
830
+ These services work seamlessly together:
831
+ - **Shared Analysis**: Reuse layout analysis results across services
832
+ - **Consistent Output**: Standardized JSON format for easy integration
833
+ - **Scalable Architecture**: Deploy services independently or together
834
+ - **Docker Ready**: All services containerized for easy deployment
835
+
836
+ ## 🤝 Contributing
837
+
838
+ We welcome contributions to improve the PDF Document Layout Analysis service!
839
+
840
+ ### How to Contribute
841
+
842
+ 1. **Fork the Repository**
843
+ ```bash
844
+ git clone https://github.com/your-username/pdf-document-layout-analysis.git
845
+ ```
846
+
847
+ 2. **Create a Feature Branch**
848
+ ```bash
849
+ git checkout -b feature/your-feature-name
850
+ ```
851
+
852
+ 3. **Set Up Development Environment**
853
+ ```bash
854
+ make install_venv
855
+ make install
856
+ ```
857
+
858
+ 4. **Make Your Changes**
859
+ - Follow the Clean Architecture principles
860
+ - Add tests for new features
861
+ - Update documentation as needed
862
+
863
+ 5. **Run Tests and Quality Checks**
864
+ ```bash
865
+ make test
866
+ make check_format
867
+ ```
868
+
869
+ 6. **Submit a Pull Request**
870
+ - Provide a clear description of the changes
871
+ - Include test results
872
+ - Reference any related issues
873
+
874
+ ### Contribution Guidelines
875
+
876
+ #### Code Standards
877
+ - **Python**: Follow PEP 8 with 125-character line length
878
+ - **Architecture**: Maintain Clean Architecture boundaries
879
+ - **Testing**: Include unit tests for new functionality
880
+ - **Documentation**: Update README and docstrings
881
+
882
+ #### Areas for Contribution
883
+
884
+ - 🐛 **Bug Fixes**: Report and fix issues
885
+ - ✨ **New Features**: Add new endpoints or functionality
886
+ - 📚 **Documentation**: Improve guides and examples
887
+ - 🧪 **Testing**: Expand test coverage
888
+ - 🚀 **Performance**: Optimize processing speed
889
+ - 🌐 **Internationalization**: Add language support
890
+
891
+ #### Development Workflow
892
+
893
+ 1. **Issue First**: Create or comment on relevant issues
894
+ 2. **Small PRs**: Keep pull requests focused and manageable
895
+ 3. **Clean Commits**: Use descriptive commit messages
896
+ 4. **Documentation**: Update relevant documentation
897
+ 5. **Testing**: Ensure all tests pass
898
+
899
+ ### Getting Help
900
+
901
+ - 📚 **Documentation**: Check this README and inline docs
902
+ - 💬 **Issues**: Search existing issues or create new ones
903
+ - 🔍 **Code**: Explore the codebase structure
904
+ - 📧 **Contact**: Reach out to maintainers for guidance
905
+
906
+ ---
907
 
908
+ ### License
909
 
910
+ This project is licensed under the terms specified in the [LICENSE](LICENSE) file.
app.py CHANGED
@@ -3,34 +3,108 @@ import tempfile
3
  import os
4
  import shutil
5
  import subprocess
 
 
 
 
6
 
7
  def process_files(pdf_file, word_file):
8
  # Create a unique temporary directory for this run
9
  temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
 
10
 
11
  # Define standard filenames for use in the pipeline
12
  pdf_path = os.path.join(temp_dir, "input.pdf")
13
  word_path = os.path.join(temp_dir, "input.docx")
14
- pdf_txt_path = os.path.join(temp_dir, "pdf_data.txt")
15
  word_json_path = os.path.join(temp_dir, "word_data.json")
16
  updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
17
  final_docx_path = os.path.join(temp_dir, "updated.docx")
18
 
19
  # Copy the uploaded files to the temp directory
20
  shutil.copy(pdf_file, pdf_path)
 
21
  shutil.copy(word_file, word_path)
 
 
 
 
22
 
23
- # Step 1: Extract text from the PDF
24
- subprocess.run(["python", "extract_pdf_data.py", pdf_path, pdf_txt_path], check=True)
25
 
26
- # Step 2: Extract red text from the Word document
27
- subprocess.run(["python", "extract_red_text.py", word_path, word_json_path], check=True)
28
 
29
- # Step 3: Update the Word JSON using the PDF text (calls OpenAI)
30
- subprocess.run(["python", "update_docx_with_pdf.py", word_json_path, pdf_txt_path, updated_json_path], check=True)
31
 
32
- # Step 4: Apply the updated JSON to the Word doc to create the final output
33
- subprocess.run(["python", "updated_word.py", word_path, updated_json_path, final_docx_path], check=True)
34
 
35
  # Return the final .docx file
36
  return final_docx_path
 
3
  import os
4
  import shutil
5
  import subprocess
6
+ from pathlib import Path
7
+
8
+ SCRIPT_DIR = Path(__file__).resolve().parent
9
+
10
+ def run_cmd(cmd, cwd=None, env=None):
11
+ """Run a command, print nice logs, and also save them to run.log in cwd."""
12
+ cwd = str(cwd or os.getcwd())
13
+ print(f"🟦 Running: {' '.join(cmd)} (cwd={cwd})")
14
+ proc = subprocess.run(
15
+ cmd,
16
+ cwd=cwd,
17
+ env=env,
18
+ capture_output=True,
19
+ text=True
20
+ )
21
+ if proc.stdout:
22
+ print("🟩 STDOUT:")
23
+ print(proc.stdout)
24
+ if proc.stderr:
25
+ print("🟥 STDERR:")
26
+ print(proc.stderr)
27
+ # Save to run.log for debugging
28
+ try:
29
+ runlog = Path(cwd) / "run.log"
30
+ with open(runlog, "a", encoding="utf-8") as f:
31
+ f.write(f"$ {' '.join(cmd)}\n")
32
+ if proc.stdout:
33
+ f.write(proc.stdout + "\n")
34
+ if proc.stderr:
35
+ f.write(proc.stderr + "\n")
36
+ print(f"🧾 Run log saved to: {runlog}")
37
+ except Exception as e:
38
+ print(f"⚠️ Could not write run.log: {e}")
39
+
40
+ if proc.returncode != 0:
41
+ # Let Gradio see the failure so it surfaces properly
42
+ raise subprocess.CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr)
43
+ return proc
44
+
45
+ def _locate_pdf_json(temp_dir: str) -> str:
46
+ """
47
+ Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json.
48
+ Find it (and a few common fallbacks). Raise if not found.
49
+ """
50
+ td = Path(temp_dir)
51
+
52
+ # Prefer exactly-named file if present
53
+ candidates = [
54
+ td / "pdf_data.json", # legacy name (if ever created)
55
+ td / "input_comprehensive_data.json", # most common from your logs
56
+ td / "comprehensive_data.json", # another common alias
57
+ td / "output.json", # generic
58
+ ]
59
+ for p in candidates:
60
+ if p.exists():
61
+ print(f"✅ Using PDF JSON: {p}")
62
+ return str(p)
63
+
64
+ # Generic pattern: anything *_comprehensive_data.json
65
+ globs = list(td.glob("*_comprehensive_data.json"))
66
+ if globs:
67
+ print(f"✅ Using PDF JSON (glob): {globs[0]}")
68
+ return str(globs[0])
69
+
70
+ # If still not found, surface a helpful error
71
+ searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
72
+ raise FileNotFoundError(
73
+ f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
74
+ )
75
 
76
  def process_files(pdf_file, word_file):
77
  # Create a unique temporary directory for this run
78
  temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
79
+ print(f"📂 Temp dir: {temp_dir}")
80
 
81
  # Define standard filenames for use in the pipeline
82
  pdf_path = os.path.join(temp_dir, "input.pdf")
83
  word_path = os.path.join(temp_dir, "input.docx")
 
84
  word_json_path = os.path.join(temp_dir, "word_data.json")
85
  updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
86
  final_docx_path = os.path.join(temp_dir, "updated.docx")
87
 
88
  # Copy the uploaded files to the temp directory
89
  shutil.copy(pdf_file, pdf_path)
90
+ print(f"📄 PDF copied to: {pdf_path}")
91
  shutil.copy(word_file, word_path)
92
+ print(f"📝 DOCX copied to: {word_path}")
93
+
94
+ # 1) PDF → JSON (extractor writes <stem>_comprehensive_data.json into cwd)
95
+ run_cmd(["python", str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)
96
 
97
+ # Find the JSON produced by the extractor
98
+ pdf_json_path = _locate_pdf_json(temp_dir)
99
 
100
+ # 2) DOCX red text JSON
101
+ run_cmd(["python", str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)
102
 
103
+ # 3) Merge JSON (uses the resolved pdf_json_path)
104
+ run_cmd(["python", str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)
105
 
106
+ # 4) Apply updates to DOCX
107
+ run_cmd(["python", str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)
108
 
109
  # Return the final .docx file
110
  return final_docx_path
dev-requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ -r requirements.txt
2
+ pytest==8.2.2
3
+ black==24.4.2
4
+ pip-upgrader==1.4.15
docker-compose-gpu.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ pdf-document-layout-analysis-gpu:
3
+ extends:
4
+ file: docker-compose.yml
5
+ service: pdf-document-layout-analysis
6
+ deploy:
7
+ resources:
8
+ reservations:
9
+ devices:
10
+ - driver: nvidia
11
+ count: 1
12
+ capabilities: [ gpu ]
13
+ environment:
14
+ - RESTART_IF_NO_GPU=$RESTART_IF_NO_GPU
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ pdf-document-layout-analysis:
3
+ container_name: pdf-document-layout-analysis
4
+ entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
5
+ init: true
6
+ restart: unless-stopped
7
+ build:
8
+ context: .
9
+ dockerfile: Dockerfile
10
+ ports:
11
+ - "5060:5060"
extract_pdf_data.py CHANGED
@@ -1,39 +1,534 @@
1
- import pdfplumber
2
- from pdf2image import convert_from_path
3
- import pytesseract
4
 
5
- def extract_pdf_full_text(pdf_path, txt_path):
6
- raw_texts = []
7
- need_ocr = []
 
 
 
 
 
 
 
 
8
 
9
- # Step 1: Try to extract RAW text, record which pages need OCR
10
- with pdfplumber.open(pdf_path) as pdf:
11
- for i, page in enumerate(pdf.pages):
12
- print(f"Extracting text from page {i+1}...")
13
- text = page.extract_text() or ""
14
- if text.strip():
15
- raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
16
- else:
17
- raw_texts.append(None)
18
- # Mark that we need OCR for this page
19
- need_ocr.append(i)
20
 
21
- # Step 2: OCR only those pages with no RAW text
22
- print("Running OCR where RAW text is missing...")
23
- images = convert_from_path(pdf_path, dpi=300)
24
- for idx in need_ocr:
25
- ocr_text = pytesseract.image_to_string(images[idx])
26
- raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"
27
 
28
- # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
29
- result = [txt for txt in raw_texts if txt]
30
- with open(txt_path, "w", encoding="utf-8") as f:
31
- f.write("\n".join(result))
32
- print(f"✅ Saved deduped full text to {txt_path}")
33
 
34
  if __name__ == "__main__":
35
- import sys
36
- # Usage: python extract_pdf_data.py input.pdf output.txt
37
- input_pdf = sys.argv[1]
38
- output_txt = sys.argv[2]
39
- extract_pdf_full_text(input_pdf, output_txt)
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fixed PDF Data Extractor - Addresses key issues in comprehensive_extract.py
4
 
5
+ Key fixes:
6
+ 1. Better table extraction and cleaning
7
+ 2. Improved key-value pair extraction
8
+ 3. More robust text processing
9
+ 4. Enhanced vehicle registration extraction
10
+ 5. Better date/number pattern recognition
11
+ """
12
+
13
+ import json
14
+ import re
15
+ import pandas as pd
16
+ from typing import Dict, List, Any, Optional
17
+ import logging
18
+ from pathlib import Path
19
+ import sys
20
+ from datetime import datetime
21
+
22
+ try:
23
+ import pdfplumber
24
+ HAS_PDFPLUMBER = True
25
+ except ImportError:
26
+ HAS_PDFPLUMBER = False
27
+
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger("fixed_pdf_extractor")
30
+
31
+ class FixedPDFExtractor:
32
+ def __init__(self):
33
+ logger.info("🚀 Initializing Fixed PDF Extractor")
34
+
35
+ def extract_everything(self, pdf_path: str) -> Dict[str, Any]:
36
+ if not HAS_PDFPLUMBER:
37
+ raise RuntimeError("pdfplumber is required. Install with: pip install pdfplumber")
38
+
39
+ logger.info(f"📖 Processing PDF: {pdf_path}")
40
+ result = {
41
+ "document_info": {
42
+ "filename": Path(pdf_path).name,
43
+ "total_pages": 0,
44
+ "extraction_timestamp": datetime.now().isoformat()
45
+ },
46
+ "extracted_data": {
47
+ "all_text_content": [],
48
+ "all_tables": [],
49
+ "key_value_pairs": {},
50
+ "audit_information": {},
51
+ "operator_information": {},
52
+ "vehicle_registrations": [],
53
+ "driver_records": [],
54
+ "compliance_summary": {},
55
+ "dates_and_numbers": {}
56
+ }
57
+ }
58
+
59
+ all_text_blocks, all_tables = [], []
60
+
61
+ with pdfplumber.open(pdf_path) as pdf:
62
+ result["document_info"]["total_pages"] = len(pdf.pages)
63
+
64
+ for page_num, page in enumerate(pdf.pages, 1):
65
+ logger.info(f"📄 Processing page {page_num}")
66
+
67
+ # Extract text with better handling
68
+ page_text = self._extract_page_text(page)
69
+ if page_text:
70
+ all_text_blocks.append({
71
+ "page": page_num,
72
+ "text": page_text,
73
+ "word_count": len(page_text.split())
74
+ })
75
+
76
+ # Extract tables with improved cleaning
77
+ tables = self._extract_page_tables(page, page_num)
78
+ all_tables.extend(tables)
79
+
80
+ result["extracted_data"]["all_text_content"] = all_text_blocks
81
+ result["extracted_data"]["all_tables"] = all_tables
82
+
83
+ # Process extracted data with improved methods
84
+ combined_text = "\n\n".join(b["text"] for b in all_text_blocks)
85
+
86
+ result["extracted_data"]["key_value_pairs"] = self._extract_key_value_pairs_improved(combined_text)
87
+ result["extracted_data"]["audit_information"] = self._extract_audit_info(combined_text, all_tables)
88
+ result["extracted_data"]["operator_information"] = self._extract_operator_info(combined_text, all_tables)
89
+ result["extracted_data"]["vehicle_registrations"] = self._extract_vehicle_registrations(all_tables)
90
+ result["extracted_data"]["driver_records"] = self._extract_driver_records(all_tables)
91
+ result["extracted_data"]["compliance_summary"] = self._extract_compliance_summary(combined_text, all_tables)
92
+ result["extracted_data"]["dates_and_numbers"] = self._extract_dates_and_numbers_improved(combined_text)
93
+
94
+ # Generate summary
95
+ result["extraction_summary"] = {
96
+ "text_blocks_found": len(all_text_blocks),
97
+ "tables_found": len(all_tables),
98
+ "key_value_pairs_found": len(result["extracted_data"]["key_value_pairs"]),
99
+ "vehicle_registrations_found": len(result["extracted_data"]["vehicle_registrations"]),
100
+ "driver_records_found": len(result["extracted_data"]["driver_records"]),
101
+ "total_characters": len(combined_text),
102
+ "processing_timestamp": datetime.now().isoformat()
103
+ }
104
+
105
+ logger.info("✅ Extraction completed!")
106
+ return result
107
+
108
+ def _extract_page_text(self, page) -> Optional[str]:
109
+ """Extract text from page with better handling"""
110
+ try:
111
+ text = page.extract_text()
112
+ if text:
113
+ # Clean up text
114
+ text = re.sub(r'[ \t]+', ' ', text.strip())
115
+ text = re.sub(r'\n\s*\n', '\n', text)
116
+ return text
117
+ except Exception as e:
118
+ logger.warning(f"Failed to extract text from page: {e}")
119
+ return None
120
+
121
+ def _extract_page_tables(self, page, page_num: int) -> List[Dict]:
122
+ """Extract tables with improved processing"""
123
+ tables = []
124
+ try:
125
+ raw_tables = page.extract_tables()
126
+ if raw_tables:
127
+ for table_idx, table in enumerate(raw_tables):
128
+ cleaned_table = self._clean_table_improved(table)
129
+ if cleaned_table and len(cleaned_table) > 0:
130
+ tables.append({
131
+ "page": page_num,
132
+ "table_index": table_idx + 1,
133
+ "headers": cleaned_table[0] if cleaned_table else [],
134
+ "data": cleaned_table[1:] if len(cleaned_table) > 1 else [],
135
+ "raw_data": cleaned_table,
136
+ "row_count": len(cleaned_table) - 1 if len(cleaned_table) > 1 else 0,
137
+ "column_count": len(cleaned_table[0]) if cleaned_table else 0
138
+ })
139
+ except Exception as e:
140
+ logger.warning(f"Failed to extract tables from page {page_num}: {e}")
141
+
142
+ return tables
143
+
144
+ def _clean_table_improved(self, table: List[List]) -> List[List[str]]:
145
+ """Improved table cleaning with better cell processing"""
146
+ if not table:
147
+ return []
148
+
149
+ cleaned = []
150
+ for row in table:
151
+ cleaned_row = []
152
+ for cell in row:
153
+ if cell is None:
154
+ cleaned_cell = ""
155
+ else:
156
+ cleaned_cell = str(cell).strip()
157
+ cleaned_cell = re.sub(r'\s+', ' ', cleaned_cell)
158
+ cleaned_cell = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', cleaned_cell)
159
+ cleaned_row.append(cleaned_cell)
160
+ if any(cell.strip() for cell in cleaned_row):
161
+ cleaned.append(cleaned_row)
162
+
163
+ # Optional: collapse single-column tables of empty strings
164
+ if cleaned and all(len(r) == len(cleaned[0]) for r in cleaned):
165
+ return cleaned
166
+ return cleaned
167
+
168
+ def _extract_key_value_pairs_improved(self, text: str) -> Dict[str, str]:
169
+ """Improved key-value pair extraction with better cleaning"""
170
+ pairs: Dict[str, str] = {}
171
+
172
+ # Normalize text a bit for regex stability
173
+ t = text.replace('\r', '\n')
174
+
175
+ # Pattern 1: colon-separated pairs (key: value)
176
+ pattern1 = re.compile(
177
+ r'([A-Za-z][\w\s()/\-.]{2,80}?):\s*([^\n\r:][^\n\r]*)'
178
+ )
179
+ for key, val in pattern1.findall(t):
180
+ k = key.strip()
181
+ v = val.strip()
182
+ # Filter junk: very long values, pure separators, or obvious headers
183
+ if not v or len(v) > 200:
184
+ continue
185
+ if re.fullmatch(r'[-_/\.]+', v):
186
+ continue
187
+ # Avoid capturing the next key as value by trimming trailing key-like tokens
188
+ v = re.sub(r'\s+[A-Z][\w\s()/\-.]{2,40}:$', '', v).strip()
189
+ # Skip values that are just long digit runs (likely id lists without meaning)
190
+ if re.fullmatch(r'\d{6,}', v):
191
+ continue
192
+ pairs[k] = v
193
+
194
+ # Pattern 2: inline “Key – Value” or “Key — Value”
195
+ pattern2 = re.compile(r'([A-Za-z][\w\s()/\-.]{2,80}?)\s*[–—-]\s*([^\n\r]+)')
196
+ for key, val in pattern2.findall(t):
197
+ k = key.strip()
198
+ v = val.strip()
199
+ if v and len(v) <= 200 and not re.fullmatch(r'\d{6,}', v):
200
+ pairs.setdefault(k, v)
201
+
202
+ return pairs
203
+
204
+ def _extract_audit_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
205
+ """Extract audit-specific information with better filtering"""
206
+ audit_info: Dict[str, Any] = {}
207
+
208
+ # Prefer tables
209
+ for table in tables:
210
+ headers = [str(h).lower() for h in table.get("headers", [])]
211
+ joined = ' '.join(headers)
212
+ if "audit information" in joined or "auditinformation" in joined:
213
+ data = table.get("data", [])
214
+ for row in data:
215
+ if len(row) >= 2 and row[0] and row[1]:
216
+ key = str(row[0]).strip()
217
+ value = str(row[1]).strip()
218
+ # Skip numbered list rows (e.g., "1.", "2)")
219
+ if re.match(r'^\s*\d+\s*[.)]\s*$', key):
220
+ continue
221
+ if key and value:
222
+ audit_info[key] = value
223
+
224
+ # Backup from text
225
+ candidates = {
226
+ "Date of Audit": r'Date\s+of\s+Audit[:\s]*([^\n\r]+)',
227
+ "Location of audit": r'Location\s+of\s+audit[:\s]*([^\n\r]+)',
228
+ "Auditor name": r'Auditor\s+name[:\s]*([^\n\r]+)',
229
+ "Audit Matrix Identifier (Name or Number)": r'Audit\s+Matrix\s+Identifier.*?[:\s]*([^\n\r]+)',
230
+ }
231
+ for k, pat in candidates.items():
232
+ if k not in audit_info:
233
+ m = re.search(pat, text, re.IGNORECASE)
234
+ if m:
235
+ audit_info[k] = m.group(1).strip()
236
+
237
+ return audit_info
238
+
239
+ def _extract_operator_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
240
+ """Extract operator information with better table parsing"""
241
+ operator_info: Dict[str, Any] = {}
242
+
243
+ # Look for operator information in tables first
244
+ for table in tables:
245
+ headers = [str(h).lower() for h in table.get("headers", [])]
246
+ if ("operatorinformation" in ' '.join(headers) or
247
+ "operator information" in ' '.join(headers) or
248
+ "operatorcontactdetails" in ' '.join(headers)):
249
+
250
+ data = table.get("data", [])
251
+ for row in data:
252
+ if len(row) >= 2 and row[0] and row[1]:
253
+ key = str(row[0]).strip()
254
+ value = str(row[1]).strip()
255
+ if key and value:
256
+ # Clean up key names
257
+ kl = key.lower()
258
+ if "operator name" in kl:
259
+ operator_info["operator_name"] = value
260
+ elif "trading name" in kl:
261
+ operator_info["trading_name"] = value
262
+ elif "company number" in kl:
263
+ if len(row) > 2:
264
+ company_parts = [str(r).strip() for r in row[1:] if str(r).strip()]
265
+ operator_info["company_number"] = "".join(company_parts)
266
+ else:
267
+ operator_info["company_number"] = value
268
+ elif "business address" in kl:
269
+ operator_info["business_address"] = value
270
+ elif "postal address" in kl:
271
+ operator_info["postal_address"] = value
272
+ elif "email" in kl:
273
+ operator_info["email"] = value
274
+ elif "telephone" in kl or "phone" in kl:
275
+ operator_info["phone"] = value
276
+ elif "nhvas accreditation" in kl:
277
+ operator_info["nhvas_accreditation"] = value
278
+ elif "nhvas manual" in kl:
279
+ operator_info["nhvas_manual"] = value
280
+
281
+ # Extract from text patterns as backup
282
+ patterns = {
283
+ 'operator_name': r'Operator\s*name[:\s\(]*([^\n\r\)]+?)(?=\s*NHVAS|\s*Registered|$)',
284
+ 'trading_name': r'Registered\s*trading\s*name[:\s\/]*([^\n\r]+?)(?=\s*Australian|$)',
285
+ 'company_number': r'Australian\s*Company\s*Number[:\s]*([0-9\s]+?)(?=\s*NHVAS|$)',
286
+ 'business_address': r'Operator\s*business\s*address[:\s]*([^\n\r]+?)(?=\s*Operator\s*Postal|$)',
287
+ 'postal_address': r'Operator\s*Postal\s*address[:\s]*([^\n\r]+?)(?=\s*Email|$)',
288
+ 'email': r'Email\s*address[:\s]*([^\s\n\r]+)',
289
+ 'phone': r'Operator\s*Telephone\s*Number[:\s]*([^\s\n\r]+)',
290
+ 'nhvas_accreditation': r'NHVAS\s*Accreditation\s*No\.[:\s\(]*([^\n\r\)]+)',
291
+ }
292
+
293
+ for key, pattern in patterns.items():
294
+ if key not in operator_info: # Only use text if not found in tables
295
+ match = re.search(pattern, text, re.IGNORECASE)
296
+ if match:
297
+ value = match.group(1).strip()
298
+ if value and len(value) < 200:
299
+ if key == 'company_number':
300
+ value = re.sub(r'\s+', '', value)
301
+ operator_info[key] = value
302
+
303
+ return operator_info
304
+
305
+ def _extract_vehicle_registrations(self, tables: List[Dict]) -> List[Dict]:
306
+ """Extract vehicle registration information from tables"""
307
+ vehicles: List[Dict[str, Any]] = []
308
+
309
+ for table in tables:
310
+ headers = [str(h).lower() for h in table.get("headers", [])]
311
+
312
+ # Look for vehicle registration tables
313
+ if any(keyword in ' '.join(headers) for keyword in ['registration', 'vehicle', 'number']):
314
+ reg_col = None
315
+ for i, header in enumerate(headers):
316
+ if 'registration' in header and 'number' in header:
317
+ reg_col = i
318
+ break
319
+
320
+ if reg_col is not None:
321
+ data = table.get("data", [])
322
+ for row in data:
323
+ if len(row) > reg_col and row[reg_col]:
324
+ reg_num = str(row[reg_col]).strip()
325
+ # Validate registration format (letters/numbers)
326
+ if re.match(r'^[A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3}$', reg_num):
327
+ vehicle_info = {"registration_number": reg_num}
328
+
329
+ # Add other columns as additional info
330
+ for i, header in enumerate(table.get("headers", [])):
331
+ if i < len(row) and i != reg_col:
332
+ vehicle_info[str(header)] = str(row[i]).strip()
333
+
334
+ vehicles.append(vehicle_info)
335
+
336
+ return vehicles
337
+
338
+ def _extract_driver_records(self, tables: List[Dict]) -> List[Dict]:
339
+ """Extract driver records from tables"""
340
+ drivers: List[Dict[str, Any]] = []
341
+
342
+ for table in tables:
343
+ headers = [str(h).lower() for h in table.get("headers", [])]
344
+
345
+ # Look for driver/scheduler tables
346
+ if any(keyword in ' '.join(headers) for keyword in ['driver', 'scheduler', 'name']):
347
+ name_col = None
348
+ for i, header in enumerate(headers):
349
+ if 'name' in header:
350
+ name_col = i
351
+ break
352
+
353
+ if name_col is not None:
354
+ data = table.get("data", [])
355
+ for row in data:
356
+ if len(row) > name_col and row[name_col]:
357
+ name = str(row[name_col]).strip()
358
+ # Basic name validation
359
+ if re.match(r'^[A-Za-z\s]{2,}$', name) and len(name.split()) >= 2:
360
+ driver_info = {"name": name}
361
+
362
+ # Add other columns
363
+ for i, header in enumerate(table.get("headers", [])):
364
+ if i < len(row) and i != name_col:
365
+ driver_info[str(header)] = str(row[i]).strip()
366
+
367
+ drivers.append(driver_info)
368
+
369
+ return drivers
370
+
371
+ def _extract_compliance_summary(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
372
+ """Extract compliance information"""
373
+ compliance = {
374
+ "standards_compliance": {},
375
+ "compliance_codes": {},
376
+ "audit_results": []
377
+ }
378
+
379
+ # Look for compliance tables
380
+ for table in tables:
381
+ headers = [str(h).lower() for h in table.get("headers", [])]
382
+
383
+ if any(keyword in ' '.join(headers) for keyword in ['compliance', 'standard', 'requirement']):
384
+ data = table.get("data", [])
385
+ for row in data:
386
+ if len(row) >= 2:
387
+ standard = str(row[0]).strip()
388
+ code = str(row[1]).strip()
389
+ if standard.startswith('Std') and code in ['V', 'NC', 'SFI', 'NAP', 'NA']:
390
+ compliance["standards_compliance"][standard] = code
391
+
392
+ # Extract compliance codes definitions
393
+ code_patterns = {
394
+ 'V': r'\bV\b\s+([^\n\r]+)',
395
+ 'NC': r'\bNC\b\s+([^\n\r]+)',
396
+ 'SFI': r'\bSFI\b\s+([^\n\r]+)',
397
+ 'NAP': r'\bNAP\b\s+([^\n\r]+)',
398
+ 'NA': r'\bNA\b\s+([^\n\r]+)',
399
+ }
400
+
401
+ for code, pattern in code_patterns.items():
402
+ match = re.search(pattern, text, re.IGNORECASE)
403
+ if match:
404
+ compliance["compliance_codes"][code] = match.group(1).strip()
405
+
406
+ return compliance
407
+
408
+ def _extract_dates_and_numbers_improved(self, text: str) -> Dict[str, Any]:
409
+ """Improved date and number extraction"""
410
+ result = {
411
+ "dates": [],
412
+ "registration_numbers": [],
413
+ "phone_numbers": [],
414
+ "email_addresses": [],
415
+ "reference_numbers": []
416
+ }
417
+
418
+ # Date patterns
419
+ date_patterns = [
420
+ r'\b(\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})\b',
421
+ r'\b(\d{1,2}/\d{1,2}/\d{4})\b',
422
+ r'\b(\d{1,2}-\d{1,2}-\d{4})\b',
423
+ r'\b(\d{1,2}\.\d{1,2}\.\d{4})\b',
424
+ ]
425
+ for pattern in date_patterns:
426
+ result["dates"].extend(re.findall(pattern, text))
427
+
428
+ # Registration numbers (Australian format-ish)
429
+ reg_pattern = r'\b([A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3})\b'
430
+ result["registration_numbers"] = list(set(re.findall(reg_pattern, text)))
431
+
432
+ # Phone numbers (AU)
433
+ phone_pattern = r'\b((?:\+61|0)[2-9]\s?\d{4}\s?\d{4})\b'
434
+ result["phone_numbers"] = list(set(re.findall(phone_pattern, text)))
435
+
436
+ # Email addresses
437
+ email_pattern = r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b'
438
+ result["email_addresses"] = list(set(re.findall(email_pattern, text)))
439
+
440
+ # Reference numbers
441
+ ref_patterns = [
442
+ (r'RF(?:S)?\s*#?\s*(\d+)', 'RFS_Certifications'),
443
+ (r'NHVAS\s+Accreditation\s+No\.?\s*(\d+)', 'NHVAS_Numbers'),
444
+ (r'Registration\s+Number\s*#?\s*(\d+)', 'Registration_Numbers'),
445
+ ]
446
+ for pattern, key in ref_patterns:
447
+ matches = re.findall(pattern, text, re.IGNORECASE)
448
+ if matches:
449
+ result["reference_numbers"].extend([f"{key}: {m}" for m in matches])
450
+
451
+ return result
452
+
453
+ @staticmethod
454
+ def save_results(results: Dict[str, Any], output_path: str):
455
+ """Save results to JSON file"""
456
+ try:
457
+ with open(output_path, 'w', encoding='utf-8') as f:
458
+ json.dump(results, f, indent=2, ensure_ascii=False)
459
+ logger.info(f"💾 Results saved to {output_path}")
460
+ except Exception as e:
461
+ logger.error(f"Failed to save results: {e}")
462
+
463
+ @staticmethod
464
+ def export_to_excel(results: Dict[str, Any], excel_path: str):
465
+ """Export results to Excel with improved formatting"""
466
+ try:
467
+ with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
468
+ # Summary sheet
469
+ summary_data = []
470
+ extraction_summary = results.get("extraction_summary", {})
471
+ for key, value in extraction_summary.items():
472
+ summary_data.append({"Metric": key.replace("_", " ").title(), "Value": value})
473
+ pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
474
+
475
+ # Key-value pairs
476
+ kv_pairs = results.get("extracted_data", {}).get("key_value_pairs", {})
477
+ if kv_pairs:
478
+ kv_df = pd.DataFrame(list(kv_pairs.items()), columns=['Key', 'Value'])
479
+ kv_df.to_excel(writer, sheet_name='Key_Value_Pairs', index=False)
480
+
481
+ # Vehicle registrations
482
+ vehicles = results.get("extracted_data", {}).get("vehicle_registrations", [])
483
+ if vehicles:
484
+ pd.DataFrame(vehicles).to_excel(writer, sheet_name='Vehicle_Registrations', index=False)
485
+
486
+ # Driver records
487
+ drivers = results.get("extracted_data", {}).get("driver_records", [])
488
+ if drivers:
489
+ pd.DataFrame(drivers).to_excel(writer, sheet_name='Driver_Records', index=False)
490
+
491
+ # Compliance summary
492
+ compliance = results.get("extracted_data", {}).get("compliance_summary", {})
493
+ if compliance.get("standards_compliance"):
494
+ comp_df = pd.DataFrame(list(compliance["standards_compliance"].items()),
495
+ columns=['Standard', 'Compliance_Code'])
496
+ comp_df.to_excel(writer, sheet_name='Compliance_Standards', index=False)
497
+
498
+ logger.info(f"📊 Results exported to Excel: {excel_path}")
499
+ except Exception as e:
500
+ logger.error(f"Failed to export to Excel: {e}")
501
+
502
+ def main():
503
+ if len(sys.argv) < 2:
504
+ print("Usage: python fixed_pdf_extractor.py <pdf_path>")
505
+ sys.exit(1)
506
+
507
+ pdf_path = Path(sys.argv[1])
508
+ if not pdf_path.exists():
509
+ print(f"❌ PDF not found: {pdf_path}")
510
+ sys.exit(1)
511
+
512
+ print("🚀 Fixed PDF Data Extractor")
513
+ print("=" * 50)
514
+
515
+ extractor = FixedPDFExtractor()
516
+ results = extractor.extract_everything(str(pdf_path))
517
+
518
+ base = pdf_path.stem
519
+ output_dir = pdf_path.parent
520
 
521
+ # Save outputs
522
+ json_path = output_dir / f"{base}_comprehensive_data.json"
523
+ excel_path = output_dir / f"{base}_fixed_extraction.xlsx"
 
 
 
 
 
 
 
 
524
 
525
+ FixedPDFExtractor.save_results(results, str(json_path))
526
+ FixedPDFExtractor.export_to_excel(results, str(excel_path))
 
 
 
 
527
 
528
+ print("\n💾 OUTPUT FILES:")
529
+ print(f" 📄 JSON Data: {json_path}")
530
+ print(f" 📊 Excel Data: {excel_path}")
531
+ print(f"\n✨ FIXED EXTRACTION COMPLETE!")
 
532
 
533
  if __name__ == "__main__":
534
+ main()
 
 
 
 
extract_red_text.py CHANGED
@@ -6,6 +6,139 @@ from docx import Document
6
  from docx.oxml.ns import qn
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
 
 
 
 
 
 
 
9
  def is_red_font(run):
10
  """Enhanced red font detection with better color checking"""
11
  col = run.font.color
@@ -76,7 +209,6 @@ def calculate_schema_match_score(schema_name, spec, context):
76
  if "Vehicle Registration" in schema_name:
77
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
78
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
79
-
80
  keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
81
  if keyword_matches >= 2:
82
  score += 150 # Very high boost for vehicle tables
@@ -157,15 +289,12 @@ def calculate_schema_match_score(schema_name, spec, context):
157
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
158
  matches = 0
159
  for lbl in labels:
160
- # More flexible matching for vehicle tables
161
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
162
  matches += 1
163
- # Also check for partial keyword matches
164
  elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
165
  matches += 0.5 # Partial credit
166
-
167
  if matches > 0:
168
- score += (matches / len(labels)) * 40 # Higher weight for row1 tables
169
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
170
 
171
  # Special handling for Declaration tables (existing logic)
@@ -187,6 +316,16 @@ def calculate_schema_match_score(schema_name, spec, context):
187
  def match_table_schema(tbl):
188
  """Improved table schema matching with scoring system"""
189
  context = get_table_context(tbl)
 
 
 
 
 
 
 
 
 
 
190
  best_match = None
191
  best_score = 0
192
  for name, spec in TABLE_SCHEMAS.items():
@@ -245,102 +384,256 @@ def extract_multi_schema_table(tbl, schemas):
245
  return result
246
 
247
  def extract_table_data(tbl, schema_name, spec):
248
- """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
249
-
250
- # 🎯 SPECIAL HANDLING for Vehicle Registration tables
 
 
 
 
 
 
 
 
251
  if "Vehicle Registration" in schema_name:
252
  print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
253
-
254
  labels = spec["labels"]
255
- collected = {lbl: [] for lbl in labels}
256
- seen = {lbl: set() for lbl in labels}
257
-
258
- # For Vehicle Registration, orientation is "row1" - headers in first row
 
259
  if len(tbl.rows) < 2:
260
  print(f" ❌ Vehicle table has less than 2 rows")
261
  return {}
262
-
263
- # Map header cells to labels
264
  header_row = tbl.rows[0]
265
  column_mapping = {}
266
-
267
  print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
268
-
269
  for col_idx, cell in enumerate(header_row.cells):
270
- header_text = normalize_text(cell.text).strip()
 
271
  if not header_text:
272
  continue
273
-
274
- print(f" Column {col_idx}: '{header_text}'")
275
-
276
- # Find best matching label
277
- best_match = None
278
- best_score = 0
279
-
280
- for label in labels:
281
- # Direct match
282
- if header_text.upper() == label.upper():
283
- best_match = label
284
- best_score = 1.0
285
- break
286
-
287
- # Partial keyword matching
288
- header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
289
- label_words = set(word.upper() for word in label.split() if len(word) > 2)
290
-
291
- if header_words and label_words:
292
- common_words = header_words.intersection(label_words)
293
- if common_words:
294
- score = len(common_words) / max(len(header_words), len(label_words))
295
- if score > best_score and score >= 0.4: # Lower threshold for vehicle tables
296
- best_score = score
297
- best_match = label
298
-
299
- if best_match:
300
- column_mapping[col_idx] = best_match
301
- print(f" ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
302
  else:
303
- print(f" ⚠️ No mapping found for '{header_text}'")
304
-
 
305
  print(f" 📊 Total column mappings: {len(column_mapping)}")
306
-
307
- # Extract red text from data rows (skip header)
308
  for row_idx in range(1, len(tbl.rows)):
309
  row = tbl.rows[row_idx]
310
  print(f" 📌 Processing data row {row_idx}")
311
-
312
  for col_idx, cell in enumerate(row.cells):
 
 
 
 
 
 
 
 
313
  if col_idx in column_mapping:
314
  label = column_mapping[col_idx]
315
-
316
- # Extract red text
317
- red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
318
-
319
- if red_txt:
320
- print(f" 🔴 Found red text in '{label}': '{red_txt}'")
321
-
322
- if red_txt not in seen[label]:
323
- seen[label].add(red_txt)
324
- collected[label].append(red_txt)
325
-
326
- # Return only non-empty collections
327
  result = {k: v for k, v in collected.items() if v}
 
 
328
  print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
329
  return result
330
-
331
- # 🎯 ORIGINAL CODE for all other tables (unchanged)
 
 
 
 
 
 
 
 
 
332
  labels = spec["labels"] + [schema_name]
333
  collected = {lbl: [] for lbl in labels}
334
  seen = {lbl: set() for lbl in labels}
335
- by_col = (spec["orientation"] == "row1")
336
  start_row = 1 if by_col else 0
337
  rows = tbl.rows[start_row:]
338
-
339
  for ri, row in enumerate(rows):
340
  for ci, cell in enumerate(row.cells):
341
- red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
 
 
 
 
342
  if not red_txt:
343
  continue
 
344
  if by_col:
345
  if ci < len(spec["labels"]):
346
  lbl = spec["labels"][ci]
@@ -354,17 +647,19 @@ def extract_table_data(tbl, schema_name, spec):
354
  lbl = spec_label
355
  break
356
  if not lbl:
 
357
  for spec_label in spec["labels"]:
358
- spec_norm = normalize_text(spec_label).upper()
359
- raw_norm = raw_label.upper()
360
- if spec_norm in raw_norm or raw_norm in spec_norm:
361
  lbl = spec_label
362
  break
363
  if not lbl:
364
  lbl = schema_name
 
365
  if red_txt not in seen[lbl]:
366
  seen[lbl].add(red_txt)
367
  collected[lbl].append(red_txt)
 
368
  return {k: v for k, v in collected.items() if v}
369
 
370
  def extract_red_text(input_doc):
@@ -405,6 +700,8 @@ def extract_red_text(input_doc):
405
  out[schema][k] = v
406
  else:
407
  out[schema] = data
 
 
408
  paras = {}
409
  for idx, para in enumerate(doc.paragraphs):
410
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
@@ -423,8 +720,16 @@ def extract_red_text(input_doc):
423
  if not context:
424
  context = "(para)"
425
  paras.setdefault(context, []).append(red_txt)
 
426
  if paras:
427
  out["paragraphs"] = paras
 
 
 
 
 
 
 
428
  return out
429
 
430
  def extract_red_text_filelike(input_file, output_file):
 
6
  from docx.oxml.ns import qn
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
9
+ def normalize_header_label(s: str) -> str:
10
+ """Normalize a header/label by stripping parentheticals & punctuation."""
11
+ s = re.sub(r"\s+", " ", s.strip())
12
+ # remove content in parentheses/brackets
13
+ s = re.sub(r"\([^)]*\)", "", s)
14
+ s = re.sub(r"\[[^]]*\]", "", s)
15
+ # unify slashes and hyphens, collapse spaces
16
+ s = s.replace("–", "-").replace("—", "-").replace("/", " / ").replace("\u00a0", " ")  # treat non-breaking spaces as normal spaces
17
+ return s.strip()
18
+
19
+ # Canonical label aliases for Vehicle/Maintenance/General headers
20
+ LABEL_ALIASES = {
21
+ # Vehicle Registration (Maintenance)
22
+ "roadworthiness certificates": "Roadworthiness Certificates",
23
+ "maintenance records": "Maintenance Records",
24
+ "daily checks": "Daily Checks",
25
+ "fault recording / reporting": "Fault Recording/ Reporting",
26
+ "fault repair": "Fault Repair",
27
+
28
+ # Vehicle Registration (Mass)
29
+ "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance",
30
+ "weight verification records": "Weight Verification Records",
31
+ "rfs suspension certification #": "RFS Suspension Certification #",
32
+ "suspension system maintenance": "Suspension System Maintenance",
33
+ "trip records": "Trip Records",
34
+ "fault recording/ reporting on suspension system": "Fault Recording/ Reporting on Suspension System",
35
+
36
+ # Common
37
+ "registration number": "Registration Number",
38
+ "no.": "No.",
39
+ "sub contractor": "Sub contractor",
40
+ "sub-contractor": "Sub contractor",
41
+ }
42
+
43
+ def looks_like_operator_declaration(context):
44
+ """True iff heading says Operator Declaration and headers include Print Name + Position Title."""
45
+ heading = (context.get("heading") or "").strip().lower()
46
+ headers = " ".join(context.get("headers") or []).lower()
47
+ return (
48
+ "operator declaration" in heading
49
+ and "print name" in headers
50
+ and "position" in headers
51
+ and "title" in headers
52
+ )
53
+
54
+ def looks_like_auditor_declaration(context):
55
+ heading = (context.get("heading") or "").strip().lower()
56
+ headers = " ".join(context.get("headers") or []).lower()
57
+ return (
58
+ "auditor declaration" in heading
59
+ and "print name" in headers
60
+ and ("nhvr" in headers or "auditor registration number" in headers)
61
+ )
62
+
63
+ # --- NEW: header-only fallback that ignores headings and just keys on the two column names
64
+ def extract_operator_declaration_by_headers_from_end(doc):
65
+ """
66
+ Scan tables from the end; if a table's first row contains both
67
+ 'Print Name' AND 'Position Title' (case-insensitive), extract red text
68
+ from the data rows into:
69
+ {"Print Name": [...], "Position Title": [...]}
70
+ """
71
+ for tbl in reversed(doc.tables):
72
+ if len(tbl.rows) < 2:
73
+ continue # need header + at least one data row
74
+
75
+ headers_norm = [normalize_header_label(c.text).lower() for c in tbl.rows[0].cells]
76
+ has_print = any("print name" in h for h in headers_norm)
77
+ has_pos_tit = any(("position title" in h) or ("position" in h and "title" in h) for h in headers_norm)
78
+ if not (has_print and has_pos_tit):
79
+ continue
80
+
81
+ idx_print = next((i for i, h in enumerate(headers_norm) if "print name" in h), None)
82
+ idx_pos = next((i for i, h in enumerate(headers_norm) if "position title" in h), None)
83
+ if idx_pos is None:
84
+ idx_pos = next((i for i, h in enumerate(headers_norm) if ("position" in h and "title" in h)), None)
85
+
86
+ result = {"Print Name": [], "Position Title": []}
87
+ for row in tbl.rows[1:]:
88
+ if idx_print is not None and idx_print < len(row.cells):
89
+ cell = row.cells[idx_print]
90
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
91
+ reds = coalesce_numeric_runs(reds)
92
+ txt = normalize_text(" ".join(reds))
93
+ if txt:
94
+ result["Print Name"].append(txt)
95
+
96
+ if idx_pos is not None and idx_pos < len(row.cells):
97
+ cell = row.cells[idx_pos]
98
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
99
+ reds = coalesce_numeric_runs(reds)
100
+ txt = normalize_text(" ".join(reds))
101
+ if txt:
102
+ result["Position Title"].append(txt)
103
+
104
+ if result["Print Name"] or result["Position Title"]:
105
+ return {k: v for k, v in result.items() if v}
106
+
107
+ return None
108
+ # --- end NEW helper
109
+
110
+ def canonicalize_label(s: str) -> str:
111
+ key = normalize_header_label(s).lower()
112
+ key = re.sub(r"\s+", " ", key)
113
+ return LABEL_ALIASES.get(key, s)
114
+
115
+ def bag_similarity(a: str, b: str) -> float:
116
+ """Loose bag-of-words similarity for header↔label matching."""
117
+ aw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(a).lower()) if len(w) > 2 or w in {"#","no"}}
118
+ bw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(b).lower()) if len(w) > 2 or w in {"#","no"}}
119
+ if not aw or not bw:
120
+ return 0.0
121
+ inter = len(aw & bw)
122
+ return inter / max(len(aw), len(bw))
123
+
124
+ def coalesce_numeric_runs(text_list):
125
+ """
126
+ If a cell yields ['4','5','6','9','8','7','1','2','3'] etc., join continuous single-char digit runs.
127
+ Returns ['456987123'] instead of many singles. Non-digit tokens are preserved.
128
+ """
129
+ out, buf = [], []
130
+ for t in text_list:
131
+ if len(t) == 1 and t.isdigit():
132
+ buf.append(t)
133
+ else:
134
+ if buf:
135
+ out.append("".join(buf))
136
+ buf = []
137
+ out.append(t)
138
+ if buf:
139
+ out.append("".join(buf))
140
+ return out
141
+
142
  def is_red_font(run):
143
  """Enhanced red font detection with better color checking"""
144
  col = run.font.color
 
209
  if "Vehicle Registration" in schema_name:
210
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
211
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
 
212
  keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
213
  if keyword_matches >= 2:
214
  score += 150 # Very high boost for vehicle tables
 
289
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
290
  matches = 0
291
  for lbl in labels:
 
292
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
293
  matches += 1
 
294
  elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
295
  matches += 0.5 # Partial credit
 
296
  if matches > 0:
297
+ score += (matches / len(labels)) * 40
298
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
299
 
300
  # Special handling for Declaration tables (existing logic)
 
316
  def match_table_schema(tbl):
317
  """Improved table schema matching with scoring system"""
318
  context = get_table_context(tbl)
319
+ # Auditor Declaration first
320
+ if ("print name" in " ".join(context.get("headers", [])).lower() and
321
+ "auditor" in " ".join(context.get("headers", [])).lower()):
322
+ return "NHVAS Approved Auditor Declaration"
323
+ # NEW: prioritize Auditor Declaration to avoid misclassification
324
+ if looks_like_auditor_declaration(context):
325
+ return "NHVAS Approved Auditor Declaration"
326
+ # hard-match Operator Declaration first (high priority, avoids misclassification)
327
+ if looks_like_operator_declaration(context):
328
+ return "Operator Declaration"
329
  best_match = None
330
  best_score = 0
331
  for name, spec in TABLE_SCHEMAS.items():
 
384
  return result
385
 
386
  def extract_table_data(tbl, schema_name, spec):
387
+ """Extract red text data from table based on schema – per-row repeats for specific tables."""
388
+
389
+ # ───────────────────────────────────────────────────────────────────────────
390
+ # OPERATOR DECLARATION (row1 headers: Print Name | Position Title)
391
+ # ───────────────────────────────────────────────────────────────────────────
392
+ if schema_name == "Operator Declaration":
393
+ print(f" 🧾 EXTRACTION FIX: Processing Operator Declaration table")
394
+
395
+ labels = spec["labels"] # ["Print Name", "Position Title"]
396
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
397
+
398
+ collected = {lbl: [] for lbl in labels}
399
+
400
+ if len(tbl.rows) < 2:
401
+ print(f" ❌ Operator Declaration table has less than 2 rows")
402
+ return {}
403
+
404
+ # map header cells → labels (row1 orientation)
405
+ header_row = tbl.rows[0]
406
+ column_mapping = {}
407
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
408
+
409
+ for col_idx, cell in enumerate(header_row.cells):
410
+ raw_h = normalize_text(cell.text)
411
+ header_text = normalize_header_label(raw_h)
412
+ if not header_text:
413
+ continue
414
+ print(f" Column {col_idx}: '{raw_h}'")
415
+
416
+ # alias/canonical first
417
+ canon = canonicalize_label(header_text)
418
+ if canon in canonical_labels:
419
+ best_label = canonical_labels[canon]
420
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
421
+ column_mapping[col_idx] = best_label
422
+ continue
423
+
424
+ # else bag-of-words similarity
425
+ best_label, best_score = None, 0.0
426
+ for canon_lab, original_lab in canonical_labels.items():
427
+ s = bag_similarity(header_text, canon_lab)
428
+ if s > best_score:
429
+ best_score, best_label = s, original_lab
430
+
431
+ if best_label and best_score >= 0.40:
432
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
433
+ column_mapping[col_idx] = best_label
434
+ else:
435
+ print(f" ⚠️ No mapping found for '{raw_h}'")
436
+
437
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
438
+
439
+ # collect red text from the (usually single) data row
440
+ for row_idx in range(1, len(tbl.rows)):
441
+ row = tbl.rows[row_idx]
442
+ print(f" 📌 Processing data row {row_idx}")
443
+ for col_idx, cell in enumerate(row.cells):
444
+ if col_idx not in column_mapping:
445
+ continue
446
+ label = column_mapping[col_idx]
447
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
448
+ if not reds:
449
+ continue
450
+ reds = coalesce_numeric_runs(reds)
451
+ red_txt = normalize_text(" ".join(reds))
452
+ if not red_txt:
453
+ continue
454
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
455
+ collected[label].append(red_txt)
456
+
457
+ result = {k: v for k, v in collected.items() if v}
458
+ print(f" ✅ Operator Declaration extracted: {len(result)} columns with data")
459
+ return result
460
+
461
+ # ───────────────────────────────────────────────────────────────────────────
462
+ # A) Vehicle Registration tables (per-row accumulation; NO dedupe)
463
+ # ───────────────────────────────────────────────────────────────────────────
464
  if "Vehicle Registration" in schema_name:
465
  print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
466
+
467
  labels = spec["labels"]
468
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
469
+
470
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
471
+ unmapped_bucket = {}
472
+
473
  if len(tbl.rows) < 2:
474
  print(f" ❌ Vehicle table has less than 2 rows")
475
  return {}
476
+
 
477
  header_row = tbl.rows[0]
478
  column_mapping = {}
 
479
  print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
480
+
481
  for col_idx, cell in enumerate(header_row.cells):
482
+ raw_h = normalize_text(cell.text)
483
+ header_text = normalize_header_label(raw_h)
484
  if not header_text:
485
  continue
486
+ print(f" Column {col_idx}: '{raw_h}'")
487
+
488
+ # Try alias/canonical first
489
+ canon = canonicalize_label(header_text)
490
+ if canon in canonical_labels:
491
+ best_label = canonical_labels[canon]
492
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
493
+ column_mapping[col_idx] = best_label
494
+ continue
495
+
496
+ # Else bag-of-words similarity
497
+ best_label, best_score = None, 0.0
498
+ for canon_lab, original_lab in canonical_labels.items():
499
+ s = bag_similarity(header_text, canon_lab)
500
+ if s > best_score:
501
+ best_score, best_label = s, original_lab
502
+
503
+ if best_label and best_score >= 0.40:
504
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
505
+ column_mapping[col_idx] = best_label
 
 
 
 
 
 
 
 
 
506
  else:
507
+ print(f" ⚠️ No mapping found for '{raw_h}'")
508
+ unmapped_bucket[raw_h] = []
509
+
510
  print(f" 📊 Total column mappings: {len(column_mapping)}")
511
+
512
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
513
  for row_idx in range(1, len(tbl.rows)):
514
  row = tbl.rows[row_idx]
515
  print(f" 📌 Processing data row {row_idx}")
 
516
  for col_idx, cell in enumerate(row.cells):
517
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
518
+ if not reds:
519
+ continue
520
+ reds = coalesce_numeric_runs(reds)
521
+ red_txt = normalize_text(" ".join(reds))
522
+ if not red_txt:
523
+ continue
524
+
525
  if col_idx in column_mapping:
526
  label = column_mapping[col_idx]
527
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
528
+ collected[label].append(red_txt) # append every occurrence
529
+ else:
530
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
531
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
532
+
 
 
 
 
 
 
533
  result = {k: v for k, v in collected.items() if v}
534
+ if unmapped_bucket:
535
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
536
  print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
537
  return result
538
+
539
+ # ───────────────────────────────────────────────────────────────────────────
540
+ # B) Driver / Scheduler Records Examined (per-row accumulation; NO dedupe)
541
+ # ───────────────────────────────────────────────────────────────────────────
542
+ if "Driver / Scheduler" in schema_name:
543
+ print(f" 👤 EXTRACTION FIX: Processing Driver / Scheduler table")
544
+
545
+ labels = spec["labels"]
546
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
547
+
548
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
549
+ unmapped_bucket = {}
550
+
551
+ if len(tbl.rows) < 2:
552
+ print(f" ❌ Driver/Scheduler table has less than 2 rows")
553
+ return {}
554
+
555
+ header_row = tbl.rows[0]
556
+ column_mapping = {}
557
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
558
+
559
+ for col_idx, cell in enumerate(header_row.cells):
560
+ raw_h = normalize_text(cell.text)
561
+ header_text = normalize_header_label(raw_h)
562
+ if not header_text:
563
+ continue
564
+ print(f" Column {col_idx}: '{raw_h}'")
565
+
566
+ # Try alias/canonical first (rarely used here, but safe)
567
+ canon = canonicalize_label(header_text)
568
+ if canon in canonical_labels:
569
+ best_label = canonical_labels[canon]
570
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
571
+ column_mapping[col_idx] = best_label
572
+ continue
573
+
574
+ # Else bag-of-words similarity (good for long headings)
575
+ best_label, best_score = None, 0.0
576
+ for canon_lab, original_lab in canonical_labels.items():
577
+ s = bag_similarity(header_text, canon_lab)
578
+ if s > best_score:
579
+ best_score, best_label = s, original_lab
580
+
581
+ if best_label and best_score >= 0.40:
582
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
583
+ column_mapping[col_idx] = best_label
584
+ else:
585
+ print(f" ⚠️ No mapping found for '{raw_h}'")
586
+ unmapped_bucket[raw_h] = []
587
+
588
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
589
+
590
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
591
+ for row_idx in range(1, len(tbl.rows)):
592
+ row = tbl.rows[row_idx]
593
+ print(f" 📌 Processing data row {row_idx}")
594
+ for col_idx, cell in enumerate(row.cells):
595
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
596
+ if not reds:
597
+ continue
598
+ reds = coalesce_numeric_runs(reds)
599
+ red_txt = normalize_text(" ".join(reds))
600
+ if not red_txt:
601
+ continue
602
+
603
+ if col_idx in column_mapping:
604
+ label = column_mapping[col_idx]
605
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
606
+ collected[label].append(red_txt) # ← append every occurrence
607
+ else:
608
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
609
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
610
+
611
+ result = {k: v for k, v in collected.items() if v}
612
+ if unmapped_bucket:
613
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
614
+ print(f" ✅ Driver / Scheduler extracted: {len(result)} columns with data")
615
+ return result
616
+
617
+ # ───────────────────────────────────────────────────────────────────────────
618
+ # C) Generic tables (unchanged: WITH dedupe)
619
+ # ───────────────────────────────────────────────────────────────────────────
620
  labels = spec["labels"] + [schema_name]
621
  collected = {lbl: [] for lbl in labels}
622
  seen = {lbl: set() for lbl in labels}
623
+ by_col = (spec.get("orientation") == "row1")
624
  start_row = 1 if by_col else 0
625
  rows = tbl.rows[start_row:]
626
+
627
  for ri, row in enumerate(rows):
628
  for ci, cell in enumerate(row.cells):
629
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
630
+ if not reds:
631
+ continue
632
+ reds = coalesce_numeric_runs(reds)
633
+ red_txt = normalize_text(" ".join(reds))
634
  if not red_txt:
635
  continue
636
+
637
  if by_col:
638
  if ci < len(spec["labels"]):
639
  lbl = spec["labels"][ci]
 
647
  lbl = spec_label
648
  break
649
  if not lbl:
650
+ a_raw = normalize_header_label(raw_label).upper()
651
  for spec_label in spec["labels"]:
652
+ a_spec = normalize_header_label(spec_label).upper()
653
+ if a_spec in a_raw or a_raw in a_spec:
 
654
  lbl = spec_label
655
  break
656
  if not lbl:
657
  lbl = schema_name
658
+
659
  if red_txt not in seen[lbl]:
660
  seen[lbl].add(red_txt)
661
  collected[lbl].append(red_txt)
662
+
663
  return {k: v for k, v in collected.items() if v}
664
 
665
  def extract_red_text(input_doc):
 
700
  out[schema][k] = v
701
  else:
702
  out[schema] = data
703
+
704
+ # paragraphs (FIX: do not return early; build full 'paras' then attach)
705
  paras = {}
706
  for idx, para in enumerate(doc.paragraphs):
707
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
 
720
  if not context:
721
  context = "(para)"
722
  paras.setdefault(context, []).append(red_txt)
723
+
724
  if paras:
725
  out["paragraphs"] = paras
726
+
727
+ # Fallback: ensure we capture the last-page Operator Declaration by headers
728
+ if "Operator Declaration" not in out:
729
+ op_dec = extract_operator_declaration_by_headers_from_end(doc)
730
+ if op_dec:
731
+ out["Operator Declaration"] = op_dec
732
+
733
  return out
734
 
735
  def extract_red_text_filelike(input_file, output_file):
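The two specialized branches above keep every row value (per-row accumulation), while the generic branch deduplicates through a per-label `seen` set. A minimal sketch of the two collection strategies, using plain dicts of already-extracted red-text strings rather than the real python-docx objects (the sample data is hypothetical):

```python
# Minimal sketch, not the project's actual extractor: contrasts the two
# collection strategies used above on pre-extracted red-text values.
rows = [
    {"Registration number": "ABC-123", "Sub-contractor": "Yes"},
    {"Registration number": "ABC-123", "Sub-contractor": "No"},
]

def collect_every_occurrence(rows: list[dict]) -> dict:
    """Vehicle Registration / Driver-Scheduler style: append every row value, duplicates included."""
    collected: dict[str, list[str]] = {}
    for row in rows:
        for label, value in row.items():
            collected.setdefault(label, []).append(value)
    return collected

def collect_deduplicated(rows: list[dict]) -> dict:
    """Generic-table style: a per-label 'seen' set drops repeated values."""
    collected: dict[str, list[str]] = {}
    seen: dict[str, set[str]] = {}
    for row in rows:
        for label, value in row.items():
            if value not in seen.setdefault(label, set()):
                seen[label].add(value)
                collected.setdefault(label, []).append(value)
    return collected

print(collect_every_occurrence(rows))  # 'ABC-123' listed twice
print(collect_deduplicated(rows))      # 'ABC-123' listed once
```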
fine_tuning_lightgbm_models.ipynb ADDED
@@ -0,0 +1,961 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d15dad13-9732-4e4c-bbd1-1a33545a4293",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Overview"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "6f857fc7-d7fb-4b05-a242-de31fb1f086d",
14
+ "metadata": {},
15
+ "source": [
16
+ "In this notebook, we'll go through the process of fine-tuning the LightGBM models in the `pdf-document-layout-analysis` service."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "id": "0c96b645-eef0-47a2-8c4f-284cdc05e76d",
22
+ "metadata": {},
23
+ "source": [
24
+ "But before doing that, let's start with some basic concepts and introduce modules and methods to make the process easier and cleaner."
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "id": "f1e5c19b-1920-4f2c-9994-943626cd8a58",
30
+ "metadata": {},
31
+ "source": [
32
+ "To begin with, you should first ensure that `Poppler` is installed on your system. We will use it to process PDFs:"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 1,
38
+ "id": "5f198930-caf1-4cb4-bb1e-8ca063ad8587",
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "pdftohtml is already installed.\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "%%bash\n",
51
+ "\n",
52
+ "if ! command -v pdftohtml &> /dev/null\n",
53
+ "then\n",
54
+ " echo \"pdftohtml is not installed. Installing now...\"\n",
55
+ " sudo apt install pdftohtml\n",
56
+ "else\n",
57
+ " echo \"pdftohtml is already installed.\"\n",
58
+ "fi"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "id": "5d971faa-e9a8-47d6-8c02-66be6f3a3c6c",
64
+ "metadata": {},
65
+ "source": [
66
+ "We use Poppler to convert PDFs to XMLs. To work with Poppler in Python, we have created the `PdfFeatures` module, which can be found in `pdf_features/PdfFeatures.py`."
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "id": "f7ac5d42-fb70-4476-8e05-b159f18ae3dd",
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "from pdf_features.PdfFeatures import PdfFeatures"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "e45522eb-6879-472a-a822-64b38041ccc3",
82
+ "metadata": {},
83
+ "source": [
84
+ "To open a PDF file with the PdfFeatures module, simply write:"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 3,
90
+ "id": "e4ac53e5-b249-4dcd-beeb-e3009e17079b",
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "Page-1\n",
98
+ "Page-2\n"
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "id": "2c7c6241-9016-4416-a53e-644145f9063a",
109
+ "metadata": {},
110
+ "source": [
111
+ "When you open `pdf_features` like this, the XML file is saved in a temporary path and handled on the fly.\n",
112
+ "\n",
113
+ "If you want to save the XML file, you should provide a path where it can be saved:"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 4,
119
+ "id": "eb1056ee-2e45-4b12-b2bc-8d23553c2143",
120
+ "metadata": {},
121
+ "outputs": [
122
+ {
123
+ "name": "stdout",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "Page-1\n",
127
+ "Page-2\n"
128
+ ]
129
+ }
130
+ ],
131
+ "source": [
132
+ "pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\", \"test_pdfs/regular.xml\")"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "markdown",
137
+ "id": "703ec555-c3a5-4e7e-a6dd-886be67cb6de",
138
+ "metadata": {},
139
+ "source": [
140
+ "Here is a part of the XML to illustrate what it looks like:"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "markdown",
145
+ "id": "5b6fcebd-f91b-43fe-b2d6-b9956c3fd173",
146
+ "metadata": {},
147
+ "source": [
148
+ "```\n",
149
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
150
+ "<!DOCTYPE pdf2xml SYSTEM \"pdf2xml.dtd\">\n",
151
+ "\n",
152
+ "<pdf2xml producer=\"poppler\" version=\"23.04.0\">\n",
153
+ "<page number=\"1\" position=\"absolute\" top=\"0\" left=\"0\" height=\"842\" width=\"595\">\n",
154
+ "\t<fontspec id=\"0\" size=\"10\" family=\"JOIBEJ+Verdana\" color=\"#000000\"/>\n",
155
+ "\t<fontspec id=\"1\" size=\"10\" family=\"JOIBGK+Verdana\" color=\"#000000\"/>\n",
156
+ "<text top=\"106\" left=\"244\" width=\"111\" height=\"12\" font=\"0\"><b>RESOLUCIÓN DE LA </b></text>\n",
157
+ "<text top=\"118\" left=\"157\" width=\"284\" height=\"12\" font=\"0\"><b>CORTE INTERAMERICANA DE DERECHOS HUMANOS </b></text>\n",
158
+ "<text top=\"129\" left=\"227\" width=\"145\" height=\"12\" font=\"0\"><b>DEL 29 DE JULIO DE 1991 </b></text>\n",
159
+ "<text top=\"141\" left=\"298\" width=\"3\" height=\"12\" font=\"0\"><b> </b></text>\n",
160
+ "<text top=\"153\" left=\"298\" width=\"3\" height=\"12\" font=\"0\"><b> </b></text>\n",
161
+ "<text top=\"165\" left=\"132\" width=\"334\" height=\"12\" font=\"0\"><b>MEDIDAS PROVISIONALES SOLICITADAS POR LA COMISIÓN </b></text>\n",
162
+ "<text top=\"177\" left=\"177\" width=\"245\" height=\"12\" font=\"0\"><b>INTERAMERICANA DE DERECHOS HUMANOS </b></text>\n",
163
+ "<text top=\"188\" left=\"225\" width=\"149\" height=\"12\" font=\"0\"><b>RESPECTO DE GUATEMALA </b></text>\n",
164
+ "\n",
165
+ "...\n",
166
+ "```"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "id": "4be01120-c4ce-4e09-bc10-64b1742c9b0b",
172
+ "metadata": {},
173
+ "source": [
174
+ "When we convert PDFs to XMLs with Poppler, it creates `tokens`. These tokens are generally lines of text, but they can vary according to Poppler's heuristics and what has been extracted. \n",
175
+ "A token can be a single character, an empty string, or an entire line. Every `<text>` item you see above is a `token`."
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "markdown",
180
+ "id": "00517165-bc84-4a6f-9a8b-91084cc603ab",
181
+ "metadata": {},
182
+ "source": [
183
+ "The PdfFeatures module provides basic capabilities for working with PDF files. Here are some features of this module. \n",
184
+ "You don't have to memorize them, but they can be useful for future reference:\n",
185
+ "\n",
186
+ "- Every PdfFeatures instance has a `pages` attribute. This attribute includes a list of `PdfPage` elements to work with each of the pages.\n",
187
+ "- Every PdfPage element has attributes like `page_number`, `page_width`, `page_height` and `tokens`.\n",
188
+ "- The `tokens` attribute includes a list of `PdfToken` elements to work with each of the tokens within that page.\n",
189
+ "- Every PdfToken element has attributes like `content`, `bounding_box`, `token_type`, `page_number`.\n",
190
+ "- The `content` attribute is, as the name implies, the string content of the given token.\n",
191
+ "- The `bounding_box` attribute stores the position of the given token on the page.\n",
192
+ "- `bounding_box` is a `Rectangle` element. For example, if you want to get the left coordinate of the token, you can do so by typing `token.bounding_box.left`. It will return an integer value.\n",
193
+ "- The `token_type` attribute stores the type of the token. It's a `TokenType` element; you'll see more details about it in the next sections.\n",
194
+ "- Like PdfPage items, tokens also have a `page_number` attribute to indicate which page they are on. This is useful in some scenarios."
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "markdown",
199
+ "id": "63a71904-0ad3-4fca-830a-402d9334614a",
200
+ "metadata": {},
201
+ "source": [
202
+ "If you want to loop through the tokens of a file and check their contents you can use something like this:"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 5,
208
+ "id": "444d3778-c3f5-48fd-aa20-cfe1bf851aad",
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "\u001B[96mRESOLUCIÓN DE LA\u001B[0m \u001B[93m[Page: 1 || Coordinates: [244, 106, 355, 118]]\u001B[0m\n",
216
+ "\u001B[96mCORTE INTERAMERICANA DE DERECHOS HUMANOS\u001B[0m \u001B[93m[Page: 1 || Coordinates: [157, 118, 441, 130]]\u001B[0m\n",
217
+ "\u001B[96mDEL 29 DE JULIO DE 1991\u001B[0m \u001B[93m[Page: 1 || Coordinates: [227, 129, 372, 141]]\u001B[0m\n",
218
+ "\u001B[96mMEDIDAS PROVISIONALES SOLICITADAS POR LA COMISIÓN\u001B[0m \u001B[93m[Page: 1 || Coordinates: [132, 165, 466, 177]]\u001B[0m\n",
219
+ "\u001B[96mINTERAMERICANA DE DERECHOS HUMANOS\u001B[0m \u001B[93m[Page: 1 || Coordinates: [177, 177, 422, 189]]\u001B[0m\n",
220
+ "\u001B[96mRESPECTO DE GUATEMALA\u001B[0m \u001B[93m[Page: 1 || Coordinates: [225, 188, 374, 200]]\u001B[0m\n",
221
+ "\u001B[96mCASO CHUNIMA\u001B[0m \u001B[93m[Page: 1 || Coordinates: [254, 224, 344, 236]]\u001B[0m\n",
222
+ "\u001B[96mLA CORTE INTERAMERICANA DE DERECHOS HUMANOS,\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 259, 393, 271]]\u001B[0m\n",
223
+ "\u001B[96mVISTOS:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 295, 137, 307]]\u001B[0m\n",
224
+ "\u001B[96m1.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 318, 101, 330]]\u001B[0m\n",
225
+ "\u001B[96mLa resolución del Presidente de la Corte Interamericana de Derechos Humanos\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 318, 511, 330]]\u001B[0m\n",
226
+ "\u001B[96mde 15 de julio de 1991, sobre medidas provisionales solicitadas por la Comisión\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 330, 514, 342]]\u001B[0m\n",
227
+ "\u001B[96mInteramericana de Derechos Humanos respecto de Guatemala;\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 342, 401, 354]]\u001B[0m\n",
228
+ "\u001B[96m2.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 366, 102, 378]]\u001B[0m\n",
229
+ "\u001B[96mLa convocatoria a una audiencia pública para el día 29 de julio de 1991 a las\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 366, 512, 378]]\u001B[0m\n",
230
+ "\u001B[96m3:00 p.m., contenida en la resolución citada;\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 378, 312, 390]]\u001B[0m\n",
231
+ "\u001B[96m3.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 401, 104, 413]]\u001B[0m\n",
232
+ "\u001B[96mLos escritos de fechas 24 y 26 de este mes de julio presentados por el\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 401, 514, 413]]\u001B[0m\n",
233
+ "\u001B[96mGobierno de Guatemala en los cuales informa que, en atención a la resolución del\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 413, 513, 425]]\u001B[0m\n",
234
+ "\u001B[96mPresidente, ha tomado medidas dirigidas a la protección de las personas\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 425, 518, 437]]\u001B[0m\n",
235
+ "\u001B[96mmencionadas en esa resolución y solicita un aplazamiento de por lo menos 30 días de\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 437, 512, 449]]\u001B[0m\n",
236
+ "\u001B[96mla audiencia convocada por el Presidente para hoy, a fin de contar con un plazo que\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 448, 512, 460]]\u001B[0m\n",
237
+ "\u001B[96mle permita hacer una presentación adecuada ante la Corte.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 460, 380, 472]]\u001B[0m\n",
238
+ "\u001B[96mCONSIDERANDO:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 484, 189, 496]]\u001B[0m\n",
239
+ "\u001B[96m1.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 508, 101, 520]]\u001B[0m\n",
240
+ "\u001B[96mQue, en virtud del artículo 23.4 de su Reglamento, la Corte Interamericana de\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 508, 511, 520]]\u001B[0m\n",
241
+ "\u001B[96mDerechos Humanos debe pronunciarse sobre la resolución del Presidente del 15 de\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 519, 513, 531]]\u001B[0m\n",
242
+ "\u001B[96mjulio de 1991;\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 531, 160, 543]]\u001B[0m\n",
243
+ "\u001B[96m2.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 555, 104, 567]]\u001B[0m\n",
244
+ "\u001B[96mQue, habida cuenta de que la Corte se encuentra reunida, debe también\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 555, 514, 567]]\u001B[0m\n",
245
+ "\u001B[96mdecidir sobre la petición de aplazamiento de la audiencia sobre medidas provisionales\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 567, 512, 579]]\u001B[0m\n",
246
+ "\u001B[96mformuladas por el Gobierno de Guatemala.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 578, 300, 590]]\u001B[0m\n",
247
+ "\u001B[96mPOR TANTO:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 602, 159, 614]]\u001B[0m\n",
248
+ "\u001B[96mLA CORTE INTERAMERICANA DE DERECHOS HUMANOS,\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 626, 393, 638]]\u001B[0m\n",
249
+ "\u001B[96mRESUELVE:\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 649, 151, 661]]\u001B[0m\n",
250
+ "\u001B[96m1.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 673, 103, 685]]\u001B[0m\n",
251
+ "\u001B[96mConvocar a una audiencia pública para el 30 de julio de 1991 a las 15:00\u001B[0m \u001B[93m[Page: 1 || Coordinates: [122, 673, 513, 685]]\u001B[0m\n",
252
+ "\u001B[96mhoras con el objeto de conocer los puntos de vista del Gobierno de Guatemala y de la\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 685, 512, 697]]\u001B[0m\n",
253
+ "\u001B[96mComisión sobre la solicitud de prórroga formulada por el primero.\u001B[0m \u001B[93m[Page: 1 || Coordinates: [88, 697, 412, 709]]\u001B[0m\n",
254
+ "\u001B[96m2\u001B[0m \u001B[93m[Page: 2 || Coordinates: [294, 71, 300, 83]]\u001B[0m\n",
255
+ "\u001B[96m2.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 106, 101, 118]]\u001B[0m\n",
256
+ "\u001B[96mConocer también, en dicha audiencia pública, de las medidas que, en atención\u001B[0m \u001B[93m[Page: 2 || Coordinates: [122, 106, 511, 118]]\u001B[0m\n",
257
+ "\u001B[96ma la resolución del Presidente del 15 de julio del presente año, ha tomado el\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 118, 515, 130]]\u001B[0m\n",
258
+ "\u001B[96mGobierno de Guatemala.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 129, 211, 141]]\u001B[0m\n",
259
+ "\u001B[96m3.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 153, 103, 165]]\u001B[0m\n",
260
+ "\u001B[96mReservarse el derecho de convocar a una audiencia pública para resolver la\u001B[0m \u001B[93m[Page: 2 || Coordinates: [122, 153, 513, 165]]\u001B[0m\n",
261
+ "\u001B[96mpetición de la Comisión sobre medidas provisionales respecto de Guatemala.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 165, 467, 177]]\u001B[0m\n",
262
+ "\u001B[96mHéctor Fix-Zamudio\u001B[0m \u001B[93m[Page: 2 || Coordinates: [249, 200, 349, 212]]\u001B[0m\n",
263
+ "\u001B[96mPresidente\u001B[0m \u001B[93m[Page: 2 || Coordinates: [272, 212, 327, 224]]\u001B[0m\n",
264
+ "\u001B[96mOrlando\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 248, 161, 260]]\u001B[0m\n",
265
+ "\u001B[96mTovar\u001B[0m \u001B[93m[Page: 2 || Coordinates: [129, 248, 191, 260]]\u001B[0m\n",
266
+ "\u001B[96mTamayo\u001B[0m \u001B[93m[Page: 2 || Coordinates: [161, 248, 234, 260]]\u001B[0m\n",
267
+ "\u001B[96mThomas\u001B[0m \u001B[93m[Page: 2 || Coordinates: [225, 248, 436, 260]]\u001B[0m\n",
268
+ "\u001B[96mBuergenthal\u001B[0m \u001B[93m[Page: 2 || Coordinates: [405, 248, 499, 260]]\u001B[0m\n",
269
+ "\u001B[96mRafael Nieto Navia\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 283, 195, 295]]\u001B[0m\n",
270
+ "\u001B[96mPolicarpo Callejas Bonilla\u001B[0m \u001B[93m[Page: 2 || Coordinates: [329, 283, 481, 295]]\u001B[0m\n",
271
+ "\u001B[96mSonia\u001B[0m \u001B[93m[Page: 2 || Coordinates: [88, 318, 150, 330]]\u001B[0m\n",
272
+ "\u001B[96mPicado\u001B[0m \u001B[93m[Page: 2 || Coordinates: [118, 318, 184, 330]]\u001B[0m\n",
273
+ "\u001B[96mSotela\u001B[0m \u001B[93m[Page: 2 || Coordinates: [153, 318, 218, 330]]\u001B[0m\n",
274
+ "\u001B[96mJulio\u001B[0m \u001B[93m[Page: 2 || Coordinates: [191, 318, 419, 330]]\u001B[0m\n",
275
+ "\u001B[96mA.\u001B[0m \u001B[93m[Page: 2 || Coordinates: [388, 318, 433, 330]]\u001B[0m\n",
276
+ "\u001B[96mBarberis\u001B[0m \u001B[93m[Page: 2 || Coordinates: [402, 318, 477, 330]]\u001B[0m\n",
277
+ "\u001B[96mManuel E. Ventura Robles\u001B[0m \u001B[93m[Page: 2 || Coordinates: [235, 354, 364, 366]]\u001B[0m\n",
278
+ "\u001B[96mSecretario\u001B[0m \u001B[93m[Page: 2 || Coordinates: [273, 366, 326, 378]]\u001B[0m\n"
279
+ ]
280
+ }
281
+ ],
282
+ "source": [
283
+ "for page in pdf_features.pages:\n",
284
+ " for token in page.tokens:\n",
285
+ " coordinates = [token.bounding_box.left, token.bounding_box.top, token.bounding_box.right, token.bounding_box.bottom]\n",
286
+ " print(f\"\\033[96m{token.content}\\033[0m \\033[93m[Page: {page.page_number} || Coordinates: {coordinates}]\\033[0m\")"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "id": "4576ff4d-92fc-4e19-a947-ebfb3fd01060",
292
+ "metadata": {},
293
+ "source": [
294
+ "## Fine-Tuning Models"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "id": "01826a89-25c9-4385-a1e6-b65c0edbd0c6",
300
+ "metadata": {},
301
+ "source": [
302
+ "Now that we have an overview of the `PdfFeatures` module, we can start the fine-tuning process."
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "markdown",
307
+ "id": "586eba43-9138-4eff-a3fa-24553de04e82",
308
+ "metadata": {},
309
+ "source": [
310
+ "In the `pdf-document-layout-analysis` service, there are two LightGBM (i.e. fast) models.\n",
311
+ "\n",
312
+ "- The first model is used to determine the types of tokens. We call it `token_type_model`.\n",
313
+ "- The second model is used to identify the segments to which the tokens belong. We call this model `paragraph_extraction_model`.\n",
314
+ "\n",
315
+ "The second model uses the predictions from the first model's output (predicted token types) as part of its features. So, let's start by fine-tuning the token type model."
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "markdown",
320
+ "id": "c326ccb1-a36b-40f9-b7e2-e83ba3c0e12b",
321
+ "metadata": {},
322
+ "source": [
323
+ "### Fine-Tuning Token Type Model"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "id": "7b3638eb-c512-4bd5-97f4-4df3ae984978",
329
+ "metadata": {},
330
+ "source": [
331
+ "#### Loading Data"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "markdown",
336
+ "id": "ab35b27c-8464-470c-9ef1-a9aef8945f6a",
337
+ "metadata": {},
338
+ "source": [
339
+ "To properly train a token type model, you should have a list of PdfFeatures items where the `token_type` attribute of their tokens is set correctly, as this attribute will be used as the label.\n",
340
+ "\n",
341
+ "To see which labels are going to be used in the model, you can check `pdf_token_type_labels/TokenType.py`. By default, we use the labels of the [DocLayNet](https://github.com/DS4SD/DocLayNet) dataset."
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 6,
347
+ "id": "2ab3093c-6e67-4505-bac3-b7db73ef5372",
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": [
351
+ "def get_pdf_features_labels() -> list[PdfFeatures]:\n",
352
+ " # Assuming that you are loading your own labels in this part.\n",
353
+ " # I'm just going to put a list with a single file for demonstration.\n",
354
+ " pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")\n",
355
+ " labeled_pdf_features_list: list[PdfFeatures] = [pdf_features]\n",
356
+ " return labeled_pdf_features_list\n",
357
+ "\n",
358
+ "def train_token_type_model():\n",
359
+ " model_configuration = ModelConfiguration()\n",
360
+ " labeled_pdf_features_list: list[PdfFeatures] = get_pdf_features_labels()\n",
361
+ " trainer = TokenTypeTrainer(labeled_pdf_features_list, model_configuration)\n",
362
+ " train_labels = [token.token_type.get_index() for token in trainer.loop_tokens()]\n",
363
+ " trainer.train(\"models/token_type_example_model.model\", train_labels) \n",
364
+ "\n",
365
+ "train_token_type_model()"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "32db8aee-9d2c-45bf-b7af-ac6249081f32",
371
+ "metadata": {},
372
+ "source": "Don't forget to check what's inside the `model_configuration`. You might need to tune the hyperparameters."
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "id": "fda0c166-ac25-4084-974a-c73f1cb06f18",
377
+ "metadata": {},
378
+ "source": "If you want to use our trained models as a base and refit them with your own data, you can use this function:"
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "id": "5acf2beb-f7a2-4e12-8f11-4bffff7efa74",
383
+ "metadata": {},
384
+ "source": [
385
+ "def refit_token_type_model():\n",
386
+ " model_configuration = ModelConfiguration()\n",
387
+ " model_configuration.resume_training = True\n",
388
+ " labeled_pdf_features_list: list[PdfFeatures] = get_pdf_features_labels()\n",
389
+ " trainer = TokenTypeTrainer(labeled_pdf_features_list, model_configuration)\n",
390
+ " train_labels = [token.token_type.get_index() for token in trainer.loop_tokens()]\n",
391
+ " trainer.train(\"models/token_type_lightgbm.model\", train_labels)\n"
392
+ ],
393
+ "outputs": [],
394
+ "execution_count": null
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "id": "7c50cbae-9841-4289-9097-7357a0c724a7",
399
+ "metadata": {},
400
+ "source": "Running this function will refit the same model with your data. Depending on your situation, it may or may not help you."
401
+ },
402
+ {
403
+ "cell_type": "markdown",
404
+ "id": "19abde59-7ba5-4e65-8ce7-6bb7fb2202d5",
405
+ "metadata": {},
406
+ "source": [
407
+ "If it does not help, you can try other fine-tuning strategies in LightGBM. \n",
408
+ "\n",
409
+ "In that case, all you need to do is change this part in `pdf_tokens_type_trainer/PdfTrainer.py` (lines 47-49):\n",
410
+ "\n",
411
+ "```\n",
412
+ " if self.model_configuration.resume_training and exists(model_path):\n",
413
+ " model = lgb.Booster(model_file=model_path)\n",
414
+ " gbm = model.refit(x_train, labels)\n",
415
+ "```"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "markdown",
420
+ "id": "5379e82a-9fa7-4fea-9d6b-a11e672707bc",
421
+ "metadata": {},
422
+ "source": "To make predictions with the trained model, you can use this function:"
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "id": "f5b7f4fb-7052-4e8c-856a-6b1d83e5ece4",
427
+ "metadata": {},
428
+ "source": [
429
+ "def get_predictions():\n",
430
+ " model_configuration = ModelConfiguration()\n",
431
+ " pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")\n",
432
+ " trainer = TokenTypeTrainer([pdf_features], model_configuration)\n",
433
+ " trainer.set_token_types()\n",
434
+ " for token in pdf_features.pages[0].tokens[:20]:\n",
435
+ " print(f\"\\033[96m{token.content}\\033[0m \\033[93m[{token.token_type}]\\033[0m\")\n",
436
+ "\n",
437
+ "get_predictions() "
438
+ ],
439
+ "outputs": [],
440
+ "execution_count": null
441
+ },
442
+ {
443
+ "cell_type": "markdown",
444
+ "id": "e6808202-892d-43e0-9e7a-73ebc347901f",
445
+ "metadata": {},
446
+ "source": "### Fine-Tuning Paragraph Extraction Model"
447
+ },
448
+ {
449
+ "cell_type": "markdown",
450
+ "id": "0b31a859-7867-4bd0-be13-7ae4ff4c8a61",
451
+ "metadata": {},
452
+ "source": "#### Loading Data"
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "id": "2778fd0b-5351-4c83-a15a-ecf8aac91397",
457
+ "metadata": {},
458
+ "source": "The second model in our pipeline is the paragraph extraction model. After finding the type of each token, we are going to \"group\" the tokens, that is, to find the segment each token belongs to."
459
+ },
460
+ {
461
+ "cell_type": "markdown",
462
+ "id": "8112645a-816d-4579-b6e9-14b505703fc9",
463
+ "metadata": {},
464
+ "source": "We are going to explain the process, but for this part we highly recommend placing your labeled data in the following file structure and using the existing methods. Otherwise, it can be harder to use our modules:"
465
+ },
466
+ {
467
+ "cell_type": "markdown",
468
+ "id": "b96c7988-cd1e-492a-9990-84db9f7111d2",
469
+ "metadata": {},
470
+ "source": [
471
+ "```\n",
472
+ ".\n",
473
+ "└── pdf-labeled-data\n",
474
+ " ├── labeled_data\n",
475
+ " │ ├── token_type\n",
476
+ " │ │ ├── train_data\n",
477
+ " │ │ │ ├── example_document1\n",
478
+ " │ │ │ │ └── labels.json\n",
479
+ " │ │ │ ├── example_document2\n",
480
+ " │ │ │ │ └── labels.json\n",
481
+ " │ │ │ └── example_document3\n",
482
+ " │ │ │ └── labels.json\n",
483
+ " │ │ └── test_data\n",
484
+ " │ │ └── example_document4\n",
485
+ " │ │ └── labels.json\n",
486
+ " │ └── paragraph_extraction\n",
487
+ " │ ├── train_data\n",
488
+ " │ │ ├── example_document1\n",
489
+ " │ │ │ └── labels.json\n",
490
+ " │ │ ├── example_document2\n",
491
+ " │ │ │ └── labels.json\n",
492
+ " │ │ └── example_document3\n",
493
+ " │ │ └── labels.json\n",
494
+ " │ └── test_data\n",
495
+ " │ └── example_document4\n",
496
+ " │ └── labels.json\n",
497
+ " └── pdfs\n",
498
+ " ├── example_document1\n",
499
+ " │ ├── document.pdf\n",
500
+ " │ └── etree.xml\n",
501
+ " ├── example_document2\n",
502
+ " │ ├── document.pdf\n",
503
+ " │ └── etree.xml\n",
504
+ " ├── example_document3\n",
505
+ " │ ├── document.pdf\n",
506
+ " │ └── etree.xml\n",
507
+ " └── example_document4\n",
508
+ " ├── document.pdf\n",
509
+ " └── etree.xml\n",
510
+ "```"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "markdown",
515
+ "id": "6c40e426-af77-47fc-a82c-77b5ca4fddeb",
516
+ "metadata": {},
517
+ "source": [
518
+ "Some details about this structure:\n",
519
+ "\n",
520
+ "- Every detail in the token type labels file structure applies to this structure too.\n",
521
+ "- The `paragraph_extraction` directory is where your paragraph extraction datasets are located; it must keep exactly this name.\n",
522
+ "- `token_type` labels are also shown in the structure because token types are used as a feature in the paragraph extraction model. If you do not have them, the pipeline will still train the model, but the `token_type` feature for every token will default to `TokenType.TEXT`.\n",
523
+ "- If you do not have `token_type` labels, another option is to predict the token types with the token type model after loading the data (shown below).\n"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "markdown",
528
+ "id": "1e234e69-7e50-4ffe-a31b-2dc8248a676f",
529
+ "metadata": {},
530
+ "source": "The labels.json files should have this structure:"
531
+ },
532
+ {
533
+ "cell_type": "markdown",
534
+ "id": "472072a6-a02c-4b75-bbc0-f13bb7e357d2",
535
+ "metadata": {},
536
+ "source": [
537
+ "```\n",
538
+ "{\n",
539
+ " \"pages\": [\n",
540
+ " {\n",
541
+ " \"number\": 1,\n",
542
+ " \"labels\": [\n",
543
+ " {\n",
544
+ " \"top\": 86,\n",
545
+ " \"left\": 162,\n",
546
+ " \"width\": 292,\n",
547
+ " \"height\": 24,\n",
548
+ " \"label_type\": 0\n",
549
+ " },\n",
550
+ " {\n",
551
+ " \"top\": 122,\n",
552
+ " \"left\": 221,\n",
553
+ " \"width\": 174,\n",
554
+ " \"height\": 12,\n",
555
+ " \"label_type\": 0\n",
556
+ " }\n",
557
+ " ]\n",
558
+ " },\n",
559
+ " {\n",
560
+ " \"number\": 2,\n",
561
+ " \"labels\": [\n",
562
+ " {\n",
563
+ " \"top\": 36,\n",
564
+ " \"left\": 296,\n",
565
+ " \"width\": 22,\n",
566
+ " \"height\": 13,\n",
567
+ " \"label_type\": 0\n",
568
+ " },\n",
569
+ " {\n",
570
+ " \"top\": 72,\n",
571
+ " \"left\": 71,\n",
572
+ " \"width\": 473,\n",
573
+ " \"height\": 49,\n",
574
+ " \"label_type\": 0\n",
575
+ " }\n",
576
+ " ]\n",
577
+ " }\n",
578
+ " ]\n",
579
+ "}\n",
580
+ "```"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "markdown",
585
+ "id": "bb6e716b-b742-4186-9e1a-ac5ecea708ac",
586
+ "metadata": {},
587
+ "source": [
588
+ "Here you see a list of labels for each page. Each label includes the coordinates `top`, `left`, `width`, `height` for each segment/paragraph. So, this time the coordinates belong to the segments, not to the tokens.\n",
589
+ "\n",
590
+ "The \"label_type\" should always be 0, since there is only one type, \"paragraph\" (don't overthink this part; just put 0 and move on).\n"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "markdown",
595
+ "id": "a2c8a9b3-6180-41f2-bb82-bea892a61f5e",
596
+ "metadata": {},
597
+ "source": "Using this information, you can load your data like this:"
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "id": "cb6ae549-4f52-45b0-853a-6414ca8b4af3",
602
+ "metadata": {},
603
+ "source": [
604
+ "from os.path import join\n",
605
+ "from paragraph_extraction_trainer.PdfParagraphTokens import PdfParagraphTokens\n",
606
+ "\n",
607
+ "\n",
608
+ "def load_paragraph_extraction_labels():\n",
609
+ "\t\n",
610
+ "\tpdf_labeled_data_root_path = \"path/to/pdf/labeled/data\"\n",
611
+ "\tdatasets_path = join(pdf_labeled_data_root_path, \"paragraph_extraction\")\n",
612
+ "\tlabeled_data: list[PdfParagraphTokens] = []\n",
613
+ "\t\n",
614
+ "\tfor dataset in listdir(join(datasets_path)):\n",
615
+ "\t\tif \"train\" not in dataset:\n",
616
+ "\t\t\tcontinue\n",
617
+ "\t\tpdf_paragraph_tokens: PdfParagraphTokens = PdfParagraphTokens.from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name)\n",
618
+ "\t\tlabeled_data.append(pdf_paragraph_tokens)\n",
619
+ "\t\n",
620
+ "\treturn labeled_data\n",
621
+ "\n",
622
+ "\n",
623
+ "from adapters.ml.pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer\n",
624
+ "\n",
625
+ "def load_paragraph_extraction_labels():\n",
626
+ "\n",
627
+ " pdf_labeled_data_root_path = \"path/to/pdf/labeled/data\"\n",
628
+ " datasets_path = join(pdf_labeled_data_root_path, \"paragraph_extraction\")\n",
629
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = []\n",
630
+ " \n",
631
+ " for dataset in listdir(join(datasets_path)):\n",
632
+ " if \"train\" not in dataset:\n",
633
+ " continue\n",
634
+ " pdf_paragraph_tokens: PdfParagraphTokens = PdfParagraphTokens.from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name)\n",
635
+ " labeled_pdf_paragraph_tokens_list.append(pdf_paragraph_tokens)\n",
636
+ " \n",
637
+ " \n",
638
+ " token_type_model_configuration = ModelConfiguration()\n",
639
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
640
+ " trainer = TokenTypeTrainer(labeled_pdf_features_list, token_type_model_configuration)\n",
641
+ " \n",
642
+ " \n",
643
+ " return labeled_pdf_paragraph_tokens_list"
644
+ ],
645
+ "outputs": [],
646
+ "execution_count": null
647
+ },
648
+ {
649
+ "cell_type": "markdown",
650
+ "id": "cf3f6a6c-cba7-43c4-9f72-85cbe447cb6e",
651
+ "metadata": {},
652
+ "source": "#### Fine-Tuning the Model"
653
+ },
654
+ {
655
+ "cell_type": "markdown",
656
+ "id": "29dbaba4-d3d6-4985-be44-df872fe9b5d4",
657
+ "metadata": {},
658
+ "source": "Again, to be able to use our trained paragraph extraction model, you should download it from our Hugging Face repo. You can just run `download_models.py` and both models will be downloaded."
659
+ },
660
+ {
661
+ "cell_type": "markdown",
662
+ "id": "8a82f6f6-cec9-48bc-9c64-b09aa65d2754",
663
+ "metadata": {},
664
+ "source": [
665
+ "If you want to download it manually, you can use this link: https://huggingface.co/HURIDOCS/pdf-document-layout-analysis/tree/main\n",
666
+ "\n",
667
+ "After downloading it, place it into `models` directory. The path should be as follows: \n",
668
+ "`~/pdf-document-layout-analysis/models/paragraph_extraction_lightgbm.model`"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "markdown",
673
+ "id": "b95cd2cd-0d41-4518-8576-b1a0d2adc21b",
674
+ "metadata": {},
675
+ "source": "To train the paragraph extraction model from scratch:"
676
+ },
677
+ {
678
+ "cell_type": "code",
679
+ "id": "67948603-80e6-4b42-9ba1-78868fd9f946",
680
+ "metadata": {},
681
+ "source": [
682
+ "from paragraph_extraction_trainer.model_configuration import MODEL_CONFIGURATION\n",
683
+ "\n",
684
+ "\n",
685
+ "def loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list: list[PdfParagraphTokens]):\n",
686
+ " for pdf_paragraph_tokens in pdf_paragraph_tokens_list:\n",
687
+ " for page in pdf_paragraph_tokens.pdf_features.pages:\n",
688
+ " if not page.tokens:\n",
689
+ " continue\n",
690
+ " for token, next_token in zip(page.tokens, page.tokens[1:]):\n",
691
+ " yield pdf_paragraph_tokens, token, next_token\n",
692
+ " yield pdf_paragraph_tokens, page.tokens[-1], page.tokens[-1]\n",
693
+ "\n",
694
+ "\n",
695
+ "def train_paragraph_extraction_model():\n",
696
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
697
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
698
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
699
+ " \n",
700
+ " train_labels = []\n",
701
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
702
+ " train_labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))\n",
703
+ "\n",
704
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
705
+ ],
706
+ "outputs": [],
707
+ "execution_count": null
708
+ },
709
+ {
710
+ "cell_type": "markdown",
711
+ "id": "2e7cd129-874e-415d-9855-401d8c5d0136",
712
+ "metadata": {},
713
+ "source": "And to refit the model with your own data, all you need to do is set the `resume_training` configuration to `True`:"
714
+ },
715
+ {
716
+ "cell_type": "code",
717
+ "id": "37b6b980-deaf-4ba4-baf0-7bf137af63a7",
718
+ "metadata": {},
719
+ "source": [
720
+ "def refit_paragraph_extraction_model():\n",
721
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
722
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
723
+ " MODEL_CONFIGURATION.resume_training = True\n",
724
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
725
+ " \n",
726
+ " train_labels = []\n",
727
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
728
+ " train_labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))\n",
729
+ "\n",
730
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
731
+ ],
732
+ "outputs": [],
733
+ "execution_count": null
734
+ },
735
+ {
736
+ "cell_type": "markdown",
737
+ "id": "1389cf49-c163-4f90-ab0c-9606756b8ef9",
738
+ "metadata": {},
739
+ "source": "<font color='red'>[IMPORTANT]</font> If you want to use your own trained models in pdf-document-layout-analysis service, make sure their names are `token_type_lightgbm.model` and `paragraph_extraction_lightgbm.model` and are placed in `models` directory."
740
+ },
741
+ {
742
+ "cell_type": "markdown",
743
+ "id": "b1d4cf8c-65d2-4496-adcf-ab73acc5000f",
744
+ "metadata": {},
745
+ "source": "After training finishes, you can get the model's predictions as shown below:"
746
+ },
747
+ {
748
+ "cell_type": "code",
749
+ "id": "69e747aa-9b19-4e8d-acbb-f8d221dfe006",
750
+ "metadata": {},
751
+ "source": [
752
+ "from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration\n",
753
+ "from fast_trainer.model_configuration import MODEL_CONFIGURATION as PARAGRAPH_EXTRACTION_CONFIGURATION\n",
754
+ "from domain.PdfSegment import PdfSegment\n",
755
+ "from adapters.ml.fast_trainer.ParagraphExtractorTrainer import ParagraphExtractorTrainer\n",
756
+ "\n",
757
+ "def get_predictions():\n",
758
+ " pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(\"test_pdfs/regular.pdf\")\n",
759
+ " # First, use token type model to find and set the types.\n",
760
+ " token_type_trainer = TokenTypeTrainer([pdf_features], ModelConfiguration())\n",
761
+ " token_type_trainer.set_token_types(\"models/token_type_lightgbm.model\")\n",
762
+ " trainer = ParagraphExtractorTrainer(pdfs_features=[pdf_features], model_configuration=PARAGRAPH_EXTRACTION_CONFIGURATION)\n",
763
+ " segments: list[PdfSegment] = trainer.get_pdf_segments(\"models/paragraph_extraction_lightgbm.model\")\n",
764
+ " model_configuration = ModelConfiguration()\n",
765
+ " for segment in segments[:20]:\n",
766
+ " print(f\"\\033[96m{segment.text_content}\\033[0m \\033[93m[{segment.segment_type}]\\033[0m \\033[91m{segment.bounding_box.to_dict()}\\033[0m\")"
767
+ ],
768
+ "outputs": [],
769
+ "execution_count": null
770
+ },
771
+ {
772
+ "cell_type": "markdown",
773
+ "id": "e3af70a1-404e-4bac-a366-f7962636b1eb",
774
+ "metadata": {},
775
+ "source": "The output of the `paragraph_extraction_model` is a list of `PdfSegment` items. Every item includes information such as `page_number`, `text_content`, `segment_type`, `bounding_box`, and `pdf_name` for each segment."
776
+ },
777
+ {
778
+ "cell_type": "code",
779
+ "id": "4dc0c106-7b22-42e3-969f-d52ecddae3ae",
780
+ "metadata": {},
781
+ "source": "",
782
+ "outputs": [],
783
+ "execution_count": null
784
+ },
785
+ {
786
+ "cell_type": "markdown",
787
+ "id": "3d5b2376-d983-4c49-8130-b94368782828",
788
+ "metadata": {},
789
+ "source": [
790
+ "```\n",
791
+ "{\n",
792
+ " \"pages\": [\n",
793
+ " {\n",
794
+ " \"number\": 1,\n",
795
+ " \"labels\": [\n",
796
+ " {\n",
797
+ " \"top\": 86,\n",
798
+ " \"left\": 162,\n",
799
+ " \"width\": 292,\n",
800
+ " \"height\": 24,\n",
801
+ " \"label_type\": 0\n",
802
+ " },\n",
803
+ " {\n",
804
+ " \"top\": 122,\n",
805
+ " \"left\": 221,\n",
806
+ " \"width\": 174,\n",
807
+ " \"height\": 12,\n",
808
+ " \"label_type\": 0\n",
809
+ " }\n",
810
+ " ]\n",
811
+ " },\n",
812
+ " {\n",
813
+ " \"number\": 2,\n",
814
+ " \"labels\": [\n",
815
+ " {\n",
816
+ " \"top\": 36,\n",
817
+ " \"left\": 296,\n",
818
+ " \"width\": 22,\n",
819
+ " \"height\": 13,\n",
820
+ " \"label_type\": 0\n",
821
+ " },\n",
822
+ " {\n",
823
+ " \"top\": 72,\n",
824
+ " \"left\": 71,\n",
825
+ " \"width\": 473,\n",
826
+ " \"height\": 49,\n",
827
+ " \"label_type\": 0\n",
828
+ " }\n",
829
+ " ]\n",
830
+ " }\n",
831
+ " ]\n",
832
+ "}\n",
833
+ "```"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "markdown",
838
+ "id": "1972189b-c70b-436d-9830-56adc354b777",
839
+ "metadata": {},
840
+ "source": [
841
+ "Using this information, you can load your data like this:"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "markdown",
846
+ "id": "d6c07ba4-334e-4ff3-8e2f-b2f684f053c9",
847
+ "metadata": {},
848
+ "source": [
849
+ "In case you do not have `token_type` labels and want to find the types with the `token_type_model`, you can use this:"
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "markdown",
854
+ "id": "41b6bb64-92a2-4b75-95f9-a934c104b7c0",
855
+ "metadata": {},
856
+ "source": [
857
+ "#### Fine-Tuning the Model"
858
+ ]
859
+ },
860
+ {
861
+ "cell_type": "markdown",
862
+ "id": "bd38ced0-2925-4fe5-98ec-b633a19b5ce3",
863
+ "metadata": {},
864
+ "source": [
865
+ "If you want to download it manually, you can use this link: https://huggingface.co/HURIDOCS/pdf-document-layout-analysis/tree/main\n",
866
+ "\n",
867
+ "After downloading it, place it into `models` directory. The path should be as follows: \n",
868
+ "`~/pdf-document-layout-analysis/models/paragraph_extraction_lightgbm.model`"
869
+ ]
870
+ },
871
+ {
872
+ "cell_type": "code",
873
+ "execution_count": 12,
874
+ "id": "60b22be7-35d0-4c34-891e-c67d25942c72",
875
+ "metadata": {},
876
+ "outputs": [],
877
+ "source": [
878
+ "from paragraph_extraction_trainer.model_configuration import MODEL_CONFIGURATION\n",
879
+ "\n",
880
+ "\n",
881
+ "def loop_pdf_paragraph_tokens(pdf_paragraph_tokens_list: list[PdfParagraphTokens]):\n",
882
+ " for pdf_paragraph_tokens in pdf_paragraph_tokens_list:\n",
883
+ " for page in pdf_paragraph_tokens.pdf_features.pages:\n",
884
+ " if not page.tokens:\n",
885
+ " continue\n",
886
+ " for token, next_token in zip(page.tokens, page.tokens[1:]):\n",
887
+ " yield pdf_paragraph_tokens, token, next_token\n",
888
+ " yield pdf_paragraph_tokens, page.tokens[-1], page.tokens[-1]\n",
889
+ "\n",
890
+ "\n",
891
+ "def train_paragraph_extraction_model():\n",
892
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
893
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
894
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
895
+ " \n",
896
+ " train_labels = []\n",
897
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
898
+ " train_labels.append(pdf_paragraph_tokens.check_same_paragraph(token, next_token))\n",
899
+ "\n",
900
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
901
+ ]
902
+ },
903
+ {
904
+ "cell_type": "code",
905
+ "execution_count": 13,
906
+ "id": "5a652ca1-b9c7-4731-ba8b-aa98cd0d11a7",
907
+ "metadata": {},
908
+ "outputs": [],
909
+ "source": [
910
+ "def refit_paragraph_extraction_model():\n",
911
+ " labeled_pdf_paragraph_tokens_list: list[PdfParagraphTokens] = load_paragraph_extraction_labels()\n",
912
+ " labeled_pdf_features_list = [pdf_paragraph_tokens.pdf_features for pdf_paragraph_tokens in labeled_pdf_paragraph_tokens_list]\n",
913
+ " MODEL_CONFIGURATION.resume_training = True\n",
914
+ " trainer = ParagraphExtractorTrainer(labeled_pdf_features_list, MODEL_CONFIGURATION)\n",
915
+ " \n",
916
+ " train_labels = []\n",
917
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens([pdf_paragraph_tokens]):\n",
918
+ " for pdf_paragraph_tokens, token, next_token in loop_pdf_paragraph_tokens(labeled_pdf_paragraph_tokens_list):\n",
919
+ "\n",
920
+ " trainer.train(\"models/paragraph_extraction_example_model.model\", train_labels) "
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "markdown",
925
+ "id": "0ca5d8ef-7455-4723-af4e-d8c49096251f",
926
+ "metadata": {},
927
+ "source": [
928
+ "After training finishes, you can get the model's predictions as shown below:"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "markdown",
933
+ "id": "e5a5ab63-7931-40e0-8f51-43e3f3ef5b32",
934
+ "metadata": {},
935
+ "source": [
936
+ "The output of the `paragraph_extraction_model` is a list of `PdfSegment` items. Every item includes information such as `page_number`, `text_content`, `segment_type`, `bounding_box`, and `pdf_name` for each segment."
937
+ ]
938
+ }
939
+ ],
940
+ "metadata": {
941
+ "kernelspec": {
942
+ "display_name": "Python 3 (ipykernel)",
943
+ "language": "python",
944
+ "name": "python3"
945
+ },
946
+ "language_info": {
947
+ "codemirror_mode": {
948
+ "name": "ipython",
949
+ "version": 3
950
+ },
951
+ "file_extension": ".py",
952
+ "mimetype": "text/x-python",
953
+ "name": "python",
954
+ "nbconvert_exporter": "python",
955
+ "pygments_lexer": "ipython3",
956
+ "version": "3.11.9"
957
+ }
958
+ },
959
+ "nbformat": 4,
960
+ "nbformat_minor": 5
961
+ }
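The notebook shows the labels.json format verbatim but never reads one back. A small sketch under the assumption that a file with exactly that structure exists at the (hypothetical) path below:

```python
import json

# Hypothetical path following the directory layout described in the notebook.
labels_path = "pdf-labeled-data/labeled_data/paragraph_extraction/train_data/example_document1/labels.json"

with open(labels_path) as labels_file:
    labels = json.load(labels_file)

for page in labels["pages"]:
    for label in page["labels"]:
        # Convert top/left/width/height into a (left, top, right, bottom) box,
        # matching the coordinate style printed for PdfToken bounding boxes.
        box = (
            label["left"],
            label["top"],
            label["left"] + label["width"],
            label["top"] + label["height"],
        )
        print(f"page {page['number']}: paragraph box {box}, label_type {label['label_type']}")
```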
images/vgtexample1.png ADDED

Git LFS Details

  • SHA256: 4b68017bb1ff60317bc2575db44db7117a245321e2baa34efd24b115748a38ca
  • Pointer size: 131 Bytes
  • Size of remote file: 240 kB
images/vgtexample2.png ADDED

Git LFS Details

  • SHA256: eb2bbb4a4ae5351cf7829b0ba217b21248fd0b92e510e3578c0130952b7573a1
  • Pointer size: 131 Bytes
  • Size of remote file: 256 kB
images/vgtexample3.png ADDED

Git LFS Details

  • SHA256: fae87bba8266250d03815b183f4c5ef3e839998bb9dcd187b99ea87e99384ff1
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
images/vgtexample4.png ADDED

Git LFS Details

  • SHA256: 1a7c9a4fe0d53c57cca52b56a1de98988b9d2ec0a7be25109d120e20f87fa118
  • Pointer size: 131 Bytes
  • Size of remote file: 213 kB
justfile ADDED
@@ -0,0 +1,95 @@
1
+ HAS_GPU := `command -v nvidia-smi > /dev/null && echo 1 || echo 0`
2
+
3
+ install:
4
+ . .venv/bin/activate; pip install -Ur requirements.txt
5
+
6
+ activate:
7
+ . .venv/bin/activate
8
+
9
+ install_venv:
10
+ python3 -m venv .venv
11
+ . .venv/bin/activate; python -m pip install --upgrade pip
12
+ . .venv/bin/activate; python -m pip install -r dev-requirements.txt
13
+
14
+ formatter:
15
+ . .venv/bin/activate; command black --line-length 125 .
16
+
17
+ check_format:
18
+ . .venv/bin/activate; command black --line-length 125 . --check
19
+
20
+ remove_docker_containers:
21
+ docker compose ps -q | xargs docker rm
22
+
23
+ remove_docker_images:
24
+ docker compose config --images | xargs docker rmi
25
+
26
+ start:
27
+ mkdir -p ./models
28
+ if [ {{HAS_GPU}} -eq 1 ]; then \
29
+ echo "NVIDIA GPU detected, using docker-compose-gpu.yml"; \
30
+ docker compose -f docker-compose-gpu.yml up --build; \
31
+ else \
32
+ echo "No NVIDIA GPU detected, using docker-compose.yml"; \
33
+ docker compose -f docker-compose.yml up --build; \
34
+ fi
35
+
36
+ start_no_gpu:
37
+ mkdir -p ./models
38
+ docker compose up --build
39
+
40
+ stop:
41
+ docker compose stop
42
+
43
+ test:
44
+ . .venv/bin/activate; command cd src; command python -m pytest
45
+
46
+ free_up_space:
47
+ df -h
48
+ sudo rm -rf /usr/share/dotnet
49
+ sudo rm -rf /opt/ghc
50
+ sudo rm -rf "/usr/local/share/boost"
51
+ sudo rm -rf "$AGENT_TOOLSDIRECTORY"
52
+ sudo apt-get remove -y '^llvm-.*' || true
53
+ sudo apt-get remove -y 'php.*' || true
54
+ sudo apt-get remove -y google-cloud-sdk hhvm google-chrome-stable firefox mono-devel || true
55
+ sudo apt-get autoremove -y
56
+ sudo apt-get clean
57
+ sudo rm -rf /usr/share/dotnet
58
+ sudo rm -rf /usr/local/lib/android
59
+ sudo rm -rf /opt/hostedtoolcache/CodeQL
60
+ sudo docker image prune --all --force
61
+ df -h
62
+
63
+ start_detached:
64
+ mkdir -p ./models
65
+ docker compose up --build -d
66
+
67
+ start_detached_gpu:
68
+ mkdir -p ./models
69
+ RESTART_IF_NO_GPU=true docker compose -f docker-compose-gpu.yml up --build -d
70
+
71
+ upgrade:
72
+ . .venv/bin/activate; pip-upgrade
73
+
74
+ tag:
75
+ #!/bin/bash
76
+ # Get current date
77
+ CURRENT_DATE=$(date +%Y.%-m.%-d)
78
+ echo "Current date: $CURRENT_DATE"
79
+
80
+ # Get the latest tag that matches today's date pattern
81
+ LATEST_TAG=$(git tag --list "${CURRENT_DATE}.*" --sort=-version:refname | head -n1)
82
+
83
+ if [ -z "$LATEST_TAG" ]; then
84
+ # No tag for today, start with revision 1
85
+ REVISION=1
86
+ else
87
+ # Extract revision number and increment
88
+ REVISION=$(echo $LATEST_TAG | cut -d. -f4)
89
+ REVISION=$((REVISION + 1))
90
+ fi
91
+
92
+ NEW_TAG="${CURRENT_DATE}.${REVISION}"
93
+ echo "Creating new tag: $NEW_TAG"
94
+ git tag $NEW_TAG
95
+ git push --tag
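The `tag` recipe derives a date-plus-revision version (for example `2025.3.18.2`). A Python sketch of the same scheme, with `existing_tags` standing in for the output of `git tag --list` (illustrative only):

```python
from datetime import date

def next_tag(existing_tags: list[str], today: date | None = None) -> str:
    # Mirror of the justfile logic: <year>.<month>.<day>.<revision>, revision restarts each day.
    today = today or date.today()
    prefix = f"{today.year}.{today.month}.{today.day}"
    revisions = [
        int(tag.rsplit(".", 1)[1])
        for tag in existing_tags
        if tag.startswith(prefix + ".") and tag.rsplit(".", 1)[1].isdigit()
    ]
    return f"{prefix}.{max(revisions) + 1 if revisions else 1}"

print(next_tag(["2025.3.18.1", "2025.3.18.2"], date(2025, 3, 18)))  # 2025.3.18.3
print(next_tag([], date(2025, 3, 18)))                              # 2025.3.18.1
```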
master_key.py CHANGED
@@ -305,6 +305,7 @@ TABLE_SCHEMAS = {
305
  "orientation": "row1",
306
  "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
307
  "priority": 90,
 
308
  "context_exclusions": ["manager", "operator declaration"]
309
  },
310
  "Audit Declaration dates": {
@@ -368,4 +369,4 @@ PARAGRAPH_PATTERNS = {
368
  "declaration_text": r"I hereby acknowledge and agree with the findings.*",
369
  "introductory_note": r"This audit assesses the.*",
370
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"
371
- }
 
305
  "orientation": "row1",
306
  "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
307
  "priority": 90,
308
+ "context_keywords": ["auditor declaration", "NHVR"],
309
  "context_exclusions": ["manager", "operator declaration"]
310
  },
311
  "Audit Declaration dates": {
 
369
  "declaration_text": r"I hereby acknowledge and agree with the findings.*",
370
  "introductory_note": r"This audit assesses the.*",
371
  "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"
372
+ }
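The change pairs the new `context_keywords` list with the existing `context_exclusions` for the auditor-declaration schema. A hedged sketch of how such an entry could gate table matching on nearby text; the helper below is illustrative, not the project's actual matcher:

```python
# Illustrative only: one way a TABLE_SCHEMAS entry with context_keywords /
# context_exclusions could be checked against text surrounding a table.
schema = {
    "orientation": "row1",
    "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
    "priority": 90,
    "context_keywords": ["auditor declaration", "NHVR"],
    "context_exclusions": ["manager", "operator declaration"],
}

def schema_matches_context(schema: dict, context_text: str) -> bool:
    # Exclusions veto the match; keywords (if any) must appear at least once.
    text = context_text.lower()
    if any(word.lower() in text for word in schema.get("context_exclusions", [])):
        return False
    keywords = schema.get("context_keywords", [])
    return not keywords or any(word.lower() in text for word in keywords)

print(schema_matches_context(schema, "Auditor Declaration (NHVR)"))        # True
print(schema_matches_context(schema, "Operator Declaration - signature"))  # False
```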
pyproject.toml ADDED
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "pdf-document-layout-analysis"
3
+ version = "2025.03.18.03"
4
+ description = "This tool is for PDF document layout analysis"
5
+ license = { file = "LICENSE" }
6
+ authors = [{ name = "HURIDOCS" }]
7
+ requires-python = ">= 3.10"
8
+ dependencies = [
9
+ "fastapi==0.111.1",
10
+ "python-multipart==0.0.9",
11
+ "uvicorn==0.30.3",
12
+ "gunicorn==22.0.0",
13
+ "requests==2.32.3",
14
+ "torch==2.4.0",
15
+ "torchvision==0.19.0",
16
+ "timm==1.0.8",
17
+ "Pillow==10.4.0",
18
+ "pdf-annotate==0.12.0",
19
+ "scipy==1.14.0",
20
+ "opencv-python==4.10.0.84",
21
+ "Shapely==2.0.5",
22
+ "transformers==4.40.2",
23
+ "huggingface_hub==0.23.5",
24
+ "pdf2image==1.17.0",
25
+ "lxml==5.2.2",
26
+ "lightgbm==4.5.0",
27
+ "setuptools==75.4.0",
28
+ "roman==4.2",
29
+ "hydra-core==1.3.2",
30
+ "pypandoc==1.13",
31
+ "rapid-latex-ocr==0.0.9",
32
+ "struct_eqtable @ git+https://github.com/UniModal4Reasoning/StructEqTable-Deploy.git@fd06078bfa9364849eb39330c075dd63cbed73ff"
33
+ ]
34
+
35
+ [project.urls]
36
+ HURIDOCS = "https://huridocs.org"
37
+ GitHub = "https://github.com/huridocs/pdf-document-layout-analysis"
38
+ HuggingFace = "https://huggingface.co/HURIDOCS/pdf-document-layout-analysis"
39
+ DockerHub = "https://hub.docker.com/r/huridocs/pdf-document-layout-analysis"
requirements.txt CHANGED
@@ -1,3 +1,30 @@
1
  gradio==4.44.1
2
  pytesseract
3
  python-docx
 
1
+ fastapi==0.111.1
2
+ pydantic==2.11.0
3
+ python-multipart==0.0.9
4
+ uvicorn==0.30.3
5
+ gunicorn==22.0.0
6
+ requests==2.32.3
7
+ torch==2.4.0
8
+ torchvision==0.19.0
9
+ Pillow==10.4.0
10
+ pdf-annotate==0.12.0
11
+ scipy==1.14.0
12
+ opencv-python==4.10.0.84
13
+ Shapely==2.0.5
14
+ transformers==4.40.2
15
+ huggingface_hub==0.23.5
16
+ pdf2image==1.17.0
17
+ lightgbm==4.5.0
18
+ setuptools==75.4.0
19
+ roman==4.2
20
+ hydra-core==1.3.2
21
+ pypandoc==1.13
22
+ rapid-table==2.0.3
23
+ rapidocr==3.2.0
24
+ pix2tex==0.1.4
25
+ latex2mathml==3.78.0
26
+ PyMuPDF==1.25.5
27
+ git+https://github.com/huridocs/pdf-features.git@2025.7.30.1
28
  gradio==4.44.1
29
  pytesseract
30
  python-docx
space-pdf/README.md ADDED
@@ -0,0 +1,910 @@
1
+ <h1 align="center">PDF Document Layout Analysis</h1>
2
+ <p align="center">A Docker-powered microservice for intelligent PDF document layout analysis, OCR, and content extraction</p>
3
+
4
+ <p align="center">
5
+ <img src="https://img.shields.io/badge/Python-3.10+-blue.svg" alt="Python Version">
6
+ <img src="https://img.shields.io/badge/FastAPI-0.111.1-green.svg" alt="FastAPI">
7
+ <img src="https://img.shields.io/badge/Docker-Ready-blue.svg" alt="Docker">
8
+ <img src="https://img.shields.io/badge/GPU-Supported-orange.svg" alt="GPU Support">
9
+ </p>
10
+
11
+
12
+ <div align="center">
13
+ <p><strong>Built with ❤️ by <a href="https://huridocs.org">HURIDOCS</a></strong></p>
14
+ <p>
15
+ <a href="https://github.com/huridocs/pdf-document-layout-analysis">⭐ Star us on GitHub</a> •
16
+ <a href="https://hub.docker.com/r/huridocs/pdf-document-layout-analysis">🐳 Pull from Docker Hub</a> •
17
+ <a href="https://huggingface.co/HURIDOCS/pdf-document-layout-analysis">🤗 View on Hugging Face</a>
18
+ </p>
19
+ </div>
20
+
21
+
22
+
23
+ ---
24
+
25
+ ## 🚀 Overview
26
+
27
+ This project provides a powerful and flexible PDF analysis microservice built with **Clean Architecture** principles. The service enables OCR, segmentation, and classification of different parts of PDF pages, identifying elements such as texts, titles, pictures, tables, formulas, and more. Additionally, it determines the correct reading order of these identified elements and can convert PDFs to various formats including Markdown and HTML.
28
+
29
+ ### ✨ Key Features
30
+
31
+ - 🔍 **Advanced PDF Layout Analysis** - Segment and classify PDF content with high accuracy
32
+ - 🖼️ **Visual & Fast Models** - Choose between VGT (Vision Grid Transformer) for accuracy or LightGBM for speed
33
+ - 📝 **Multi-format Output** - Export to JSON, Markdown, HTML, and visualize PDF segmentations
34
+ - 🌐 **OCR Support** - 150+ languages supported via Tesseract OCR
35
+ - 📊 **Table & Formula Extraction** - Extract tables as HTML and formulas as LaTeX
36
+ - 🏗️ **Clean Architecture** - Modular, testable, and maintainable codebase
37
+ - 🐳 **Docker-Ready** - Easy deployment with GPU support
38
+ - ⚡ **RESTful API** - Comprehensive API with 10+ endpoints
39
+
40
+ <table>
41
+ <tr>
42
+ <td>
43
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample1.png"/>
44
+ </td>
45
+ <td>
46
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample2.png"/>
47
+ </td>
48
+ <td>
49
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample3.png"/>
50
+ </td>
51
+ <td>
52
+ <img src="https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/main/images/vgtexample4.png"/>
53
+ </td>
54
+ </tr>
55
+ </table>
56
+
57
+ ### 🔗 Project Links
58
+
59
+ - **GitHub**: [pdf-document-layout-analysis](https://github.com/huridocs/pdf-document-layout-analysis)
60
+ - **HuggingFace**: [pdf-document-layout-analysis](https://huggingface.co/HURIDOCS/pdf-document-layout-analysis)
61
+ - **DockerHub**: [pdf-document-layout-analysis](https://hub.docker.com/r/huridocs/pdf-document-layout-analysis/)
62
+
63
+ ---
64
+
65
+ ## 🚀 Quick Start
66
+
67
+ ### 1. Start the Service
68
+
69
+ **With GPU support (recommended for better performance):**
70
+ ```bash
71
+ make start
72
+ ```
73
+
74
+ **Without GPU support:**
75
+ ```bash
76
+ make start_no_gpu
77
+ ```
78
+
79
+ The service will be available at `http://localhost:5060`
80
+
81
+ **Check service status:**
82
+
83
+ ```bash
84
+ curl http://localhost:5060/info
85
+ ```
86
+
87
+ ### 2. Basic PDF Analysis
88
+
89
+ **Analyze a PDF document (VGT model - high accuracy):**
90
+ ```bash
91
+ curl -X POST -F 'file=@/path/to/your/document.pdf' http://localhost:5060
92
+ ```
93
+
94
+ **Fast analysis (LightGBM models - faster processing):**
95
+ ```bash
96
+ curl -X POST -F 'file=@/path/to/your/document.pdf' -F "fast=true" http://localhost:5060
97
+ ```
98
+
99
+ ### 3. Stop the Service
100
+
101
+ ```bash
102
+ make stop
103
+ ```
104
+
105
+ > 💡 **Tip**: Replace `/path/to/your/document.pdf` with the actual path to your PDF file. The service will return a JSON response with segmented content and metadata.
106
+
107
+
108
+ ## 📋 Table of Contents
109
+
110
+ - [🚀 Quick Start](#-quick-start)
111
+ - [⚙️ Dependencies](#-dependencies)
112
+ - [📋 Requirements](#-requirements)
113
+ - [📚 API Reference](#-api-reference)
114
+ - [💡 Usage Examples](#-usage-examples)
115
+ - [🏗️ Architecture](#-architecture)
116
+ - [🤖 Models](#-models)
117
+ - [📊 Data](#-data)
118
+ - [🔧 Development](#-development)
119
+ - [📈 Benchmarks](#-benchmarks)
120
+ - [Performance](#performance)
121
+ - [Speed](#speed)
122
+ - [🌐 Installation of More Languages for OCR](#-installation-of-more-languages-for-ocr)
123
+ - [🔗 Related Services](#-related-services)
124
+ - [🤝 Contributing](#-contributing)
125
+
126
+
127
+
128
+ ## ⚙️ Dependencies
129
+
130
+ ### Required
131
+ - **Docker Desktop 4.25.0+** - [Installation Guide](https://www.docker.com/products/docker-desktop/)
132
+ - **Python 3.10+** (for local development)
133
+
134
+ ### Optional
135
+ - **NVIDIA Container Toolkit** - [Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (for GPU support)
136
+
137
+ ## 📋 Requirements
138
+
139
+ ### System Requirements
140
+ - **RAM**: 2 GB minimum
141
+ - **GPU Memory**: 5 GB (optional, will fall back to CPU if unavailable)
142
+ - **Disk Space**: 10 GB for models and dependencies
143
+ - **CPU**: Multi-core recommended for better performance
144
+
145
+ ### Docker Requirements
146
+ - Docker Engine 20.10+
147
+ - Docker Compose 2.0+
148
+
149
+ ## 📚 API Reference
150
+
151
+ The service provides a comprehensive RESTful API with the following endpoints:
152
+
153
+ ### Core Analysis Endpoints
154
+
155
+ | Endpoint | Method | Description | Parameters |
156
+ |----------|--------|-------------|------------|
157
+ | `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `parse_tables_and_math` |
158
+ | `/save_xml/{filename}` | POST | Analyze PDF and save XML output | `file`, `xml_file_name`, `fast` |
159
+ | `/get_xml/{filename}` | GET | Retrieve saved XML analysis | `xml_file_name` |
160
+
161
+ ### Content Extraction Endpoints
162
+
163
+ | Endpoint | Method | Description | Parameters |
164
+ |----------|--------|-------------|------------|
165
+ | `/text` | POST | Extract text by content types | `file`, `fast`, `types` |
166
+ | `/toc` | POST | Extract table of contents | `file`, `fast` |
167
+ | `/toc_legacy_uwazi_compatible` | POST | Extract TOC (Uwazi compatible) | `file` |
168
+
169
+ ### Format Conversion Endpoints
170
+
171
+ | Endpoint | Method | Description | Parameters |
172
+ |----------|--------|-------------|------------|
173
+ | `/markdown` | POST | Convert PDF to Markdown (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
174
+ | `/html` | POST | Convert PDF to HTML (includes segmentation data in zip) | `file`, `fast`, `extract_toc`, `dpi`, `output_file` |
175
+ | `/visualize` | POST | Visualize segmentation results on the PDF | `file`, `fast` |
176
+
177
+ ### OCR & Utility Endpoints
178
+
179
+ | Endpoint | Method | Description | Parameters |
180
+ |----------|--------|-------------|------------|
181
+ | `/ocr` | POST | Apply OCR to PDF | `file`, `language` |
182
+ | `/info` | GET | Get service information | - |
183
+ | `/` | GET | Health check and system info | - |
184
+ | `/error` | GET | Test error handling | - |
185
+
186
+ ### Common Parameters
187
+
188
+ - **`file`**: PDF file to process (multipart/form-data)
189
+ - **`fast`**: Use LightGBM models instead of VGT (boolean, default: false)
190
+ - **`parse_tables_and_math`**: Parse tables (extracted as HTML) and convert formulas to LaTeX (boolean, default: false)
191
+ - **`language`**: OCR language code (string, default: "en")
192
+ - **`types`**: Comma-separated content types to extract (string, default: "all")
193
+ - **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
194
+ - **`dpi`**: Image resolution for conversion (integer, default: 120)
195
+
196
+ ## 💡 Usage Examples
197
+
198
+ ### Basic PDF Analysis
199
+
200
+ **Standard analysis with VGT model:**
201
+ ```bash
202
+ curl -X POST \
203
+ -F 'file=@document.pdf' \
204
+ http://localhost:5060
205
+ ```
206
+
207
+ **Fast analysis with LightGBM models:**
208
+ ```bash
209
+ curl -X POST \
210
+ -F 'file=@document.pdf' \
211
+ -F 'fast=true' \
212
+ http://localhost:5060
213
+ ```
214
+
215
+ **Analysis with table and math parsing:**
216
+ ```bash
217
+ curl -X POST \
218
+ -F 'file=@document.pdf' \
219
+ -F 'parse_tables_and_math=true' \
220
+ http://localhost:5060
221
+ ```
222
+
223
+ ### Text Extraction
224
+
225
+ **Extract all text:**
226
+ ```bash
227
+ curl -X POST \
228
+ -F 'file=@document.pdf' \
229
+ -F 'types=all' \
230
+ http://localhost:5060/text
231
+ ```
232
+
233
+ **Extract specific content types:**
234
+ ```bash
235
+ curl -X POST \
236
+ -F 'file=@document.pdf' \
237
+ -F 'types=title,text,table' \
238
+ http://localhost:5060/text
239
+ ```
240
+
241
+ ### Format Conversion
242
+
243
+ **Convert to Markdown:**
244
+ ```bash
245
+ curl -X POST http://localhost:5060/markdown \
246
+ -F 'file=@document.pdf' \
247
+ -F 'extract_toc=true' \
248
+ -F 'output_file=document.md' \
249
+ --output 'document.zip'
250
+ ```
251
+
252
+ **Convert to HTML:**
253
+ ```bash
254
+ curl -X POST http://localhost:5060/html \
255
+ -F 'file=@document.pdf' \
256
+ -F 'extract_toc=true' \
257
+ -F 'output_file=document.html' \
258
+ --output 'document.zip'
259
+ ```
260
+
261
+ > **📋 Segmentation Data**: Format conversion endpoints automatically include detailed segmentation data in the zip output. The resulting zip file contains a `{filename}_segmentation.json` file with information about each detected document segment including:
262
+ > - **Coordinates**: `left`, `top`, `width`, `height`
263
+ > - **Page information**: `page_number`, `page_width`, `page_height`
264
+ > - **Content**: `text` content and segment `type` (e.g., "Title", "Text", "Table", "Picture")
265
+
266
+
267
+ ### OCR Processing
268
+
269
+ **OCR in English:**
270
+ ```bash
271
+ curl -X POST \
272
+ -F 'file=@scanned_document.pdf' \
273
+ -F 'language=en' \
274
+ http://localhost:5060/ocr \
275
+ --output ocr_processed.pdf
276
+ ```
277
+
278
+ **OCR in other languages:**
279
+ ```bash
280
+ # French
281
+ curl -X POST \
282
+ -F 'file=@document_french.pdf' \
283
+ -F 'language=fr' \
284
+ http://localhost:5060/ocr \
285
+ --output ocr_french.pdf
286
+
287
+ # Spanish
288
+ curl -X POST \
289
+ -F 'file=@document_spanish.pdf' \
290
+ -F 'language=es' \
291
+ http://localhost:5060/ocr \
292
+ --output ocr_spanish.pdf
293
+ ```
294
+
295
+ ### Visualization
296
+
297
+ **Generate visualization PDF:**
298
+ ```bash
299
+ curl -X POST \
300
+ -F 'file=@document.pdf' \
301
+ http://localhost:5060/visualize \
302
+ --output visualization.pdf
303
+ ```
304
+
305
+ ### Table of Contents Extraction
306
+
307
+ **Extract structured TOC:**
308
+ ```bash
309
+ curl -X POST \
310
+ -F 'file=@document.pdf' \
311
+ http://localhost:5060/toc
312
+ ```
313
+
314
+ ### XML Storage and Retrieval
315
+
316
+ **Analyze and save XML:**
317
+ ```bash
318
+ curl -X POST \
319
+ -F 'file=@document.pdf' \
320
+ http://localhost:5060/save_xml/my_analysis
321
+ ```
322
+
323
+ **Retrieve saved XML:**
324
+ ```bash
325
+ curl http://localhost:5060/get_xml/my_analysis.xml
326
+ ```
327
+
328
+ ### Service Information
329
+
330
+ **Get service info and supported languages:**
331
+ ```bash
332
+ curl http://localhost:5060/info
333
+ ```
334
+
335
+ **Health check:**
336
+ ```bash
337
+ curl http://localhost:5060/
338
+ ```
339
+
340
+ ### Response Format
341
+
342
+ Most endpoints return JSON with segment information:
343
+
344
+ ```json
345
+ [
346
+ {
347
+ "left": 72.0,
348
+ "top": 84.0,
349
+ "width": 451.2,
350
+ "height": 23.04,
351
+ "page_number": 1,
352
+ "page_width": 595.32,
353
+ "page_height": 841.92,
354
+ "text": "Document Title",
355
+ "type": "Title"
356
+ },
357
+ {
358
+ "left": 72.0,
359
+ "top": 120.0,
360
+ "width": 451.2,
361
+ "height": 200.0,
362
+ "page_number": 1,
363
+ "page_width": 595.32,
364
+ "page_height": 841.92,
365
+ "text": "This is the main text content...",
366
+ "type": "Text"
367
+ }
368
+ ]
369
+ ```
370
+
371
+ ### Supported Content Types
372
+
373
+ - `Caption` - Image and table captions
374
+ - `Footnote` - Footnote text
375
+ - `Formula` - Mathematical formulas
376
+ - `List item` - List items and bullet points
377
+ - `Page footer` - Footer content
378
+ - `Page header` - Header content
379
+ - `Picture` - Images and figures
380
+ - `Section header` - Section headings
381
+ - `Table` - Table content
382
+ - `Text` - Regular text paragraphs
383
+ - `Title` - Document and section titles
384
+
385
+
386
+ ## 🏗️ Architecture
387
+
388
+ This project follows **Clean Architecture** principles, ensuring separation of concerns, testability, and maintainability. The codebase is organized into distinct layers:
389
+
390
+ ### Directory Structure
391
+
392
+ ```
393
+ src/
394
+ ├── domain/ # Enterprise Business Rules
395
+ │ ├── PdfImages.py # PDF image handling domain logic
396
+ │ ├── PdfSegment.py # PDF segment entity
397
+ │ ├── Prediction.py # ML prediction entity
398
+ │ └── SegmentBox.py # Core segment box entity
399
+ ├── use_cases/ # Application Business Rules
400
+ │ ├── pdf_analysis/ # PDF analysis use case
401
+ │ ├── text_extraction/ # Text extraction use case
402
+ │ ├── toc_extraction/ # Table of contents extraction
403
+ │ ├── visualization/ # PDF visualization use case
404
+ │ ├── ocr/ # OCR processing use case
405
+ │ ├── markdown_conversion/ # Markdown conversion use case
406
+ │ └── html_conversion/ # HTML conversion use case
407
+ ├── adapters/ # Interface Adapters
408
+ │ ├── infrastructure/ # External service adapters
409
+ │ ├── ml/ # Machine learning model adapters
410
+ │ ├── storage/ # File storage adapters
411
+ │ └── web/ # Web framework adapters
412
+ ├── ports/ # Interface definitions
413
+ │ ├── services/ # Service interfaces
414
+ │ └── repositories/ # Repository interfaces
415
+ └── drivers/ # Frameworks & Drivers
416
+ └── web/ # FastAPI application setup
417
+ ```
418
+
419
+ ### Layer Responsibilities
420
+
421
+ - **Domain Layer**: Contains core business entities and rules independent of external concerns
422
+ - **Use Cases Layer**: Orchestrates domain entities to fulfill specific application requirements
423
+ - **Adapters Layer**: Implements interfaces defined by inner layers and adapts external frameworks
424
+ - **Drivers Layer**: Contains frameworks, databases, and external agency configurations
425
+
426
+ ### Key Benefits
427
+
428
+ - 🔄 **Dependency Inversion**: High-level modules don't depend on low-level modules
429
+ - 🧪 **Testability**: Easy to unit test business logic in isolation
430
+ - 🔧 **Maintainability**: Changes to external frameworks don't affect business rules
431
+ - 📈 **Scalability**: Easy to add new features without modifying existing code
432
+
433
+
434
+ ## 🤖 Models
435
+
436
+ The service offers two complementary model approaches, each optimized for different use cases:
437
+
438
+ ### 1. Vision Grid Transformer (VGT) - High Accuracy Model
439
+
440
+ **Overview**: A state-of-the-art visual model developed by Alibaba Research Group that "sees" the entire page layout.
441
+
442
+ **Key Features**:
443
+ - 🎯 **High Accuracy**: Best-in-class performance on document layout analysis
444
+ - 👁️ **Visual Understanding**: Analyzes the entire page context including spatial relationships
445
+ - 📊 **Trained on DocLayNet**: Uses the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet)
446
+ - 🔬 **Research-Backed**: Based on [Advanced Literate Machinery](https://github.com/AlibabaResearch/AdvancedLiterateMachinery)
447
+
448
+ **Resource Requirements**:
449
+ - GPU: 5GB+ VRAM (recommended)
450
+ - CPU: Falls back automatically if GPU unavailable
451
+ - Processing Speed: ~1.75 seconds/page (GPU [GTX 1070]) or ~13.5 seconds/page (CPU [i7-8700])
452
+
453
+ ### 2. LightGBM Models - Fast & Efficient
454
+
455
+ **Overview**: Lightweight ensemble of two specialized models using XML-based features from Poppler.
456
+
457
+ **Key Features**:
458
+ - ⚡ **High Speed**: ~0.42 seconds per page on CPU (i7-8700)
459
+ - 💾 **Low Resource Usage**: CPU-only, minimal memory footprint
460
+ - 🔄 **Dual Model Approach**:
461
+ - **Token Type Classifier**: Identifies content types (title, text, table, etc.)
462
+ - **Segmentation Model**: Determines proper content boundaries
463
+ - 📄 **XML-Based**: Uses Poppler's PDF-to-XML conversion for feature extraction
464
+
465
+ **Trade-offs**:
466
+ - Slightly lower accuracy compared to VGT
467
+ - No visual context understanding
468
+ - Excellent for batch processing and resource-constrained environments
469
+
470
+ ### OCR Integration
471
+
472
+ Both models integrate seamlessly with OCR capabilities:
473
+
474
+ - **Engine**: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
475
+ - **Processing**: [ocrmypdf](https://ocrmypdf.readthedocs.io/en/latest/index.html)
476
+ - **Languages**: 150+ supported languages
477
+ - **Output**: Searchable PDFs with preserved layout
478
+
479
+ ### Model Selection Guide
480
+
481
+ | Use Case | Recommended Model | Reason |
482
+ |----------|------------------|---------|
483
+ | High accuracy requirements | VGT | Superior visual understanding |
484
+ | Batch processing | LightGBM | Faster processing, lower resources |
485
+ | GPU available | VGT | Leverages GPU acceleration |
486
+ | CPU-only environment | LightGBM | Optimized for CPU processing |
487
+ | Real-time applications | LightGBM | Consistent fast response times |
488
+ | Research/analysis | VGT | Best accuracy for detailed analysis |
489
+
490
+ ## 📊 Data
491
+
492
+ ### Training Dataset
493
+
494
+ Both model types are trained on the comprehensive [DocLayNet dataset](https://github.com/DS4SD/DocLayNet), a large-scale document layout analysis dataset containing over 80,000 document pages.
495
+
496
+ ### Document Categories
497
+
498
+ The models can identify and classify 11 distinct content types:
499
+
500
+ | ID | Category | Description |
501
+ |----|----------|-------------|
502
+ | 1 | **Caption** | Image and table captions |
503
+ | 2 | **Footnote** | Footnote references and text |
504
+ | 3 | **Formula** | Mathematical equations and formulas |
505
+ | 4 | **List item** | Bulleted and numbered list items |
506
+ | 5 | **Page footer** | Footer content and page numbers |
507
+ | 6 | **Page header** | Header content and titles |
508
+ | 7 | **Picture** | Images, figures, and graphics |
509
+ | 8 | **Section header** | Section and subsection headings |
510
+ | 9 | **Table** | Tabular data and structures |
511
+ | 10 | **Text** | Regular paragraph text |
512
+ | 11 | **Title** | Document and chapter titles |
513
+
514
+ ### Dataset Characteristics
515
+
516
+ - **Domain Coverage**: Academic papers, technical documents, reports
517
+ - **Language**: Primarily English with multilingual support
518
+ - **Quality**: High-quality annotations with bounding boxes and labels
519
+ - **Diversity**: Various document layouts, fonts, and formatting styles
520
+
521
+ For detailed information about the dataset, visit the [DocLayNet repository](https://github.com/DS4SD/DocLayNet).
522
+
523
+ ## 🔧 Development
524
+
525
+ ### Local Development Setup
526
+
527
+ 1. **Clone the repository:**
528
+ ```bash
529
+ git clone https://github.com/huridocs/pdf-document-layout-analysis.git
530
+ cd pdf-document-layout-analysis
531
+ ```
532
+
533
+ 2. **Create virtual environment:**
534
+ ```bash
535
+ make install_venv
536
+ ```
537
+
538
+ 3. **Activate environment:**
539
+ ```bash
540
+ make activate
541
+ # or manually: source .venv/bin/activate
542
+ ```
543
+
544
+ 4. **Install dependencies:**
545
+ ```bash
546
+ make install
547
+ ```
548
+
549
+ ### Code Quality
550
+
551
+ **Format code:**
552
+ ```bash
553
+ make formatter
554
+ ```
555
+
556
+ **Check formatting:**
557
+ ```bash
558
+ make check_format
559
+ ```
560
+
561
+ ### Testing
562
+
563
+ **Run tests:**
564
+ ```bash
565
+ make test
566
+ ```
567
+
568
+ **Integration tests:**
569
+ ```bash
570
+ # Tests are located in src/tests/integration/
571
+ python -m pytest src/tests/integration/test_end_to_end.py
572
+ ```
573
+
574
+ ### Docker Development
575
+
576
+ **Build and start (detached mode):**
577
+ ```bash
578
+ # With GPU
579
+ make start_detached_gpu
580
+
581
+ # Without GPU
582
+ make start_detached
583
+ ```
584
+
585
+ **Clean up Docker resources:**
586
+ ```bash
587
+ # Remove containers
588
+ make remove_docker_containers
589
+
590
+ # Remove images
591
+ make remove_docker_images
592
+ ```
593
+
594
+ ### Project Structure
595
+
596
+ ```
597
+ pdf-document-layout-analysis/
598
+ ├── src/ # Source code
599
+ │ ├── domain/ # Business entities
600
+ │ ├── use_cases/ # Application logic
601
+ │ ├── adapters/ # External integrations
602
+ │ ├── ports/ # Interface definitions
603
+ │ └── drivers/ # Framework configurations
604
+ ├── test_pdfs/ # Test PDF files
605
+ ├── models/ # ML model storage
606
+ ├── docker-compose.yml # Docker configuration
607
+ ├── Dockerfile # Container definition
608
+ ├── Makefile # Development commands
609
+ ├── pyproject.toml # Python project configuration
610
+ └── requirements.txt # Python dependencies
611
+ ```
612
+
613
+ ### Environment Variables
614
+
615
+ Key configuration options:
616
+
617
+ ```bash
618
+ # OCR configuration
619
+ OCR_SOURCE=/tmp/ocr_source
620
+
621
+ # Model paths (auto-configured)
622
+ MODELS_PATH=./models
623
+
624
+ # Service configuration
625
+ HOST=0.0.0.0
626
+ PORT=5060
627
+ ```
628
+
629
+ ### Adding New Features
630
+
631
+ 1. **Domain Logic**: Add entities in `src/domain/`
632
+ 2. **Use Cases**: Implement business logic in `src/use_cases/`
633
+ 3. **Adapters**: Create integrations in `src/adapters/`
634
+ 4. **Ports**: Define interfaces in `src/ports/`
635
+ 5. **Controllers**: Add endpoints in `src/adapters/web/`
636
+
637
+ ### Debugging
638
+
639
+ **View logs:**
640
+ ```bash
641
+ docker compose logs -f
642
+ ```
643
+
644
+ **Access container:**
645
+ ```bash
646
+ docker exec -it pdf-document-layout-analysis /bin/bash
647
+ ```
648
+
649
+ **Free up disk space:**
650
+ ```bash
651
+ make free_up_space
652
+ ```
653
+
654
+ ### Order of Output Elements
655
+
656
+ The service returns SegmentBox elements in a carefully determined reading order:
657
+
658
+ #### Reading Order Algorithm
659
+
660
+ 1. **Poppler Integration**: Uses [Poppler](https://poppler.freedesktop.org) PDF-to-XML conversion to establish initial token reading order
661
+ 2. **Segment Averaging**: Calculates average reading order for multi-token segments
662
+ 3. **Type-Based Sorting**: Prioritizes content types:
663
+ - **Headers** placed first
664
+ - **Main content** in reading order
665
+ - **Footers and footnotes** placed last
666
+
667
+ #### Non-Text Elements
668
+
669
+ For segments without text (e.g., images):
670
+ - Processed after text-based sorting
671
+ - Positioned based on nearest text segment proximity
672
+ - Uses spatial distance as the primary criterion
673
+
674
+ ### Advanced Table and Formula Extraction
675
+
676
+ #### Default Behavior
677
+ - **Formulas**: Automatically extracted as LaTeX format in the `text` property
678
+ - **Tables**: Basic text extraction included by default
679
+
680
+ #### Enhanced Table Extraction
681
+
682
+ Parse tables and extract them in HTML format by setting `parse_tables_and_math=true`:
683
+
684
+ ```bash
685
+ curl -X POST -F 'file=@document.pdf' -F 'parse_tables_and_math=true' http://localhost:5060
686
+ ```
687
+
688
+
689
+ #### Extraction Engines
690
+ - **Formulas**: [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
691
+ - **Tables**: [RapidTable](https://github.com/RapidAI/RapidTable)
692
+
693
+
694
+ ## 📈 Benchmarks
695
+
696
+ ### Performance
697
+
698
+ VGT model performance on PubLayNet dataset:
699
+
700
+ | Metric | Overall | Text | Title | List | Table | Figure |
701
+ |--------|---------|------|-------|------|-------|--------|
702
+ | **F1 Score** | **0.962** | 0.950 | 0.939 | 0.968 | 0.981 | 0.971 |
703
+
704
+ > 📊 **Comparison**: View comprehensive model comparisons at [Papers With Code](https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val)
705
+
706
+ ### Speed
707
+
708
+ Performance benchmarks on 15-page academic documents:
709
+
710
+ | Model | Hardware | Speed (sec/page) | Use Case |
711
+ |-------|----------|------------------|----------|
712
+ | **LightGBM** | CPU (i7-8700 3.2GHz) | **0.42** | Fast processing |
713
+ | **VGT** | GPU (GTX 1070) | **1.75** | High accuracy |
714
+ | **VGT** | CPU (i7-8700 3.2GHz) | 13.5 | CPU fallback |
715
+
716
+ ### Performance Recommendations
717
+
718
+ - **GPU Available**: Use VGT for best accuracy-speed balance
719
+ - **CPU Only**: Use LightGBM for optimal performance
720
+ - **Batch Processing**: LightGBM for consistent throughput
721
+ - **High Accuracy**: VGT with GPU for best results
722
+
723
+
724
+ ## 🌐 Installation of More Languages for OCR
725
+
726
+ The service uses Tesseract OCR with support for 150+ languages. The Docker image includes only common languages to minimize image size.
727
+
728
+ ### Installing Additional Languages
729
+
730
+ #### 1. Access the Container
731
+ ```bash
732
+ docker exec -it --user root pdf-document-layout-analysis /bin/bash
733
+ ```
734
+
735
+ #### 2. Install Language Packs
736
+ ```bash
737
+ # Install specific language
738
+ apt-get update
739
+ apt-get install tesseract-ocr-[LANGCODE]
740
+ ```
741
+
742
+ #### 3. Common Language Examples
743
+
744
+ ```bash
745
+ # Korean
746
+ apt-get install tesseract-ocr-kor
747
+
748
+ # German
749
+ apt-get install tesseract-ocr-deu
750
+
751
+ # French
752
+ apt-get install tesseract-ocr-fra
753
+
754
+ # Spanish
755
+ apt-get install tesseract-ocr-spa
756
+
757
+ # Chinese Simplified
758
+ apt-get install tesseract-ocr-chi-sim
759
+
760
+ # Arabic
761
+ apt-get install tesseract-ocr-ara
762
+
763
+ # Japanese
764
+ apt-get install tesseract-ocr-jpn
765
+ ```
766
+
767
+ #### 4. Verify Installation
768
+
769
+ ```bash
770
+ curl http://localhost:5060/info
771
+ ```
772
+
773
+ ### Language Code Reference
774
+
775
+ Find Tesseract language codes in the [ISO to Tesseract mapping](https://github.com/huridocs/pdf-document-layout-analysis/blob/main/src/adapters/infrastructure/ocr/languages.py).
776
+
777
+ ### Supported Languages
778
+
779
+ Common language codes:
780
+ - `eng` - English
781
+ - `fra` - French
782
+ - `deu` - German
783
+ - `spa` - Spanish
784
+ - `ita` - Italian
785
+ - `por` - Portuguese
786
+ - `rus` - Russian
787
+ - `chi-sim` - Chinese Simplified
788
+ - `chi-tra` - Chinese Traditional
789
+ - `jpn` - Japanese
790
+ - `kor` - Korean
791
+ - `ara` - Arabic
792
+ - `hin` - Hindi
793
+
794
+ ### Usage with Multiple Languages
795
+
796
+ ```bash
797
+ # OCR with specific language
798
+ curl -X POST \
799
+ -F 'file=@document.pdf' \
800
+ -F 'language=fr' \
801
+ http://localhost:5060/ocr \
802
+ --output french_ocr.pdf
803
+ ```
804
+
805
+
806
+ ## 🔗 Related Services
807
+
808
+ Explore our ecosystem of PDF processing services built on this foundation:
809
+
810
+ ### [PDF Table of Contents Extractor](https://github.com/huridocs/pdf-table-of-contents-extractor)
811
+ 🔍 **Purpose**: Intelligent extraction of structured table of contents from PDF documents
812
+
813
+ **Key Features**:
814
+ - Leverages layout analysis for accurate TOC identification
815
+ - Hierarchical structure recognition
816
+ - Multiple output formats supported
817
+ - Integration-ready API
818
+
819
+ ### [PDF Text Extraction](https://github.com/huridocs/pdf-text-extraction)
820
+ 📝 **Purpose**: Advanced text extraction with layout awareness
821
+
822
+ **Key Features**:
823
+ - Content-type aware extraction
824
+ - Preserves document structure
825
+ - Reading order optimization
826
+ - Clean text output with metadata
827
+
828
+ ### Integration Benefits
829
+
830
+ These services work seamlessly together:
831
+ - **Shared Analysis**: Reuse layout analysis results across services
832
+ - **Consistent Output**: Standardized JSON format for easy integration
833
+ - **Scalable Architecture**: Deploy services independently or together
834
+ - **Docker Ready**: All services containerized for easy deployment
835
+
836
+ ## 🤝 Contributing
837
+
838
+ We welcome contributions to improve the PDF Document Layout Analysis service!
839
+
840
+ ### How to Contribute
841
+
842
+ 1. **Fork the Repository**
843
+ ```bash
844
+ git clone https://github.com/your-username/pdf-document-layout-analysis.git
845
+ ```
846
+
847
+ 2. **Create a Feature Branch**
848
+ ```bash
849
+ git checkout -b feature/your-feature-name
850
+ ```
851
+
852
+ 3. **Set Up Development Environment**
853
+ ```bash
854
+ make install_venv
855
+ make install
856
+ ```
857
+
858
+ 4. **Make Your Changes**
859
+ - Follow the Clean Architecture principles
860
+ - Add tests for new features
861
+ - Update documentation as needed
862
+
863
+ 5. **Run Tests and Quality Checks**
864
+ ```bash
865
+ make test
866
+ make check_format
867
+ ```
868
+
869
+ 6. **Submit a Pull Request**
870
+ - Provide clear description of changes
871
+ - Include test results
872
+ - Reference any related issues
873
+
874
+ ### Contribution Guidelines
875
+
876
+ #### Code Standards
877
+ - **Python**: Follow PEP 8 with 125-character line length
878
+ - **Architecture**: Maintain Clean Architecture boundaries
879
+ - **Testing**: Include unit tests for new functionality
880
+ - **Documentation**: Update README and docstrings
881
+
882
+ #### Areas for Contribution
883
+
884
+ - 🐛 **Bug Fixes**: Report and fix issues
885
+ - ✨ **New Features**: Add new endpoints or functionality
886
+ - 📚 **Documentation**: Improve guides and examples
887
+ - 🧪 **Testing**: Expand test coverage
888
+ - 🚀 **Performance**: Optimize processing speed
889
+ - 🌐 **Internationalization**: Add language support
890
+
891
+ #### Development Workflow
892
+
893
+ 1. **Issue First**: Create or comment on relevant issues
894
+ 2. **Small PRs**: Keep pull requests focused and manageable
895
+ 3. **Clean Commits**: Use descriptive commit messages
896
+ 4. **Documentation**: Update relevant documentation
897
+ 5. **Testing**: Ensure all tests pass
898
+
899
+ ### Getting Help
900
+
901
+ - 📚 **Documentation**: Check this README and inline docs
902
+ - 💬 **Issues**: Search existing issues or create new ones
903
+ - 🔍 **Code**: Explore the codebase structure
904
+ - 📧 **Contact**: Reach out to maintainers for guidance
905
+
906
+ ---
907
+
908
+ ### License
909
+
910
+ This project is licensed under the terms specified in the [LICENSE](LICENSE) file.
space-pdf/app.py ADDED
@@ -0,0 +1,124 @@
1
+ import gradio as gr
2
+ import tempfile
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ SCRIPT_DIR = Path(__file__).resolve().parent
9
+
10
+ def run_cmd(cmd, cwd=None, env=None):
11
+ """Run a command, print nice logs, and also save them to run.log in cwd."""
12
+ cwd = str(cwd or os.getcwd())
13
+ print(f"🟦 Running: {' '.join(cmd)} (cwd={cwd})")
14
+ proc = subprocess.run(
15
+ cmd,
16
+ cwd=cwd,
17
+ env=env,
18
+ capture_output=True,
19
+ text=True
20
+ )
21
+ if proc.stdout:
22
+ print("🟩 STDOUT:")
23
+ print(proc.stdout)
24
+ if proc.stderr:
25
+ print("🟥 STDERR:")
26
+ print(proc.stderr)
27
+ # Save to run.log for debugging
28
+ try:
29
+ runlog = Path(cwd) / "run.log"
30
+ with open(runlog, "a", encoding="utf-8") as f:
31
+ f.write(f"$ {' '.join(cmd)}\n")
32
+ if proc.stdout:
33
+ f.write(proc.stdout + "\n")
34
+ if proc.stderr:
35
+ f.write(proc.stderr + "\n")
36
+ print(f"🧾 Run log saved to: {runlog}")
37
+ except Exception as e:
38
+ print(f"⚠️ Could not write run.log: {e}")
39
+
40
+ if proc.returncode != 0:
41
+ # Let Gradio see the failure so it surfaces properly
42
+ raise subprocess.CalledProcessError(proc.returncode, cmd, proc.stdout, proc.stderr)
43
+ return proc
44
+
45
+ def _locate_pdf_json(temp_dir: str) -> str:
46
+ """
47
+ Your extractor writes a JSON like <pdf_stem>_comprehensive_data.json.
48
+ Find it (and a few common fallbacks). Raise if not found.
49
+ """
50
+ td = Path(temp_dir)
51
+
52
+ # Prefer exactly-named file if present
53
+ candidates = [
54
+ td / "pdf_data.json", # legacy name (if ever created)
55
+ td / "input_comprehensive_data.json", # most common from your logs
56
+ td / "comprehensive_data.json", # another common alias
57
+ td / "output.json", # generic
58
+ ]
59
+ for p in candidates:
60
+ if p.exists():
61
+ print(f"✅ Using PDF JSON: {p}")
62
+ return str(p)
63
+
64
+ # Generic pattern: anything *_comprehensive_data.json
65
+ globs = list(td.glob("*_comprehensive_data.json"))
66
+ if globs:
67
+ print(f"✅ Using PDF JSON (glob): {globs[0]}")
68
+ return str(globs[0])
69
+
70
+ # If still not found, surface a helpful error
71
+ searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
72
+ raise FileNotFoundError(
73
+ f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
74
+ )
75
+
76
+ def process_files(pdf_file, word_file):
77
+ # Create a unique temporary directory for this run
78
+ temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
79
+ print(f"📂 Temp dir: {temp_dir}")
80
+
81
+ # Define standard filenames for use in the pipeline
82
+ pdf_path = os.path.join(temp_dir, "input.pdf")
83
+ word_path = os.path.join(temp_dir, "input.docx")
84
+ word_json_path = os.path.join(temp_dir, "word_data.json")
85
+ updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
86
+ final_docx_path = os.path.join(temp_dir, "updated.docx")
87
+
88
+ # Copy the uploaded files to the temp directory
89
+ shutil.copy(pdf_file, pdf_path)
90
+ print(f"📄 PDF copied to: {pdf_path}")
91
+ shutil.copy(word_file, word_path)
92
+ print(f"📝 DOCX copied to: {word_path}")
93
+
94
+ # 1) PDF → JSON (extractor writes <stem>_comprehensive_data.json into cwd)
95
+ run_cmd(["python", str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)
96
+
97
+ # Find the JSON produced by the extractor
98
+ pdf_json_path = _locate_pdf_json(temp_dir)
99
+
100
+ # 2) DOCX red text → JSON
101
+ run_cmd(["python", str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)
102
+
103
+ # 3) Merge JSON (uses the resolved pdf_json_path)
104
+ run_cmd(["python", str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)
105
+
106
+ # 4) Apply updates to DOCX
107
+ run_cmd(["python", str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)
108
+
109
+ # Return the final .docx file
110
+ return final_docx_path
111
+
112
+ iface = gr.Interface(
113
+ fn=process_files,
114
+ inputs=[
115
+ gr.File(label="Upload PDF File", type="filepath"),
116
+ gr.File(label="Upload Word File", type="filepath")
117
+ ],
118
+ outputs=gr.File(label="Download Updated Word File"),
119
+ title="Red Text Replacer",
120
+ description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching content from the PDF."
121
+ )
122
+
123
+ if __name__ == "__main__":
124
+ iface.launch()
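For a quick check outside the web UI, the same pipeline can be driven directly from Python; `sample.pdf` and `template.docx` are placeholder file names, and this assumes the helper scripts and their dependencies are installed alongside `app.py`.

```python
# Run the four-step pipeline without launching Gradio.
from app import process_files

result_path = process_files("sample.pdf", "template.docx")
print("Updated document written to:", result_path)
```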
space-pdf/extract_pdf_data.py ADDED
@@ -0,0 +1,534 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fixed PDF Data Extractor - Addresses key issues in comprehensive_extract.py
4
+
5
+ Key fixes:
6
+ 1. Better table extraction and cleaning
7
+ 2. Improved key-value pair extraction
8
+ 3. More robust text processing
9
+ 4. Enhanced vehicle registration extraction
10
+ 5. Better date/number pattern recognition
11
+ """
12
+
13
+ import json
14
+ import re
15
+ import pandas as pd
16
+ from typing import Dict, List, Any, Optional
17
+ import logging
18
+ from pathlib import Path
19
+ import sys
20
+ from datetime import datetime
21
+
22
+ try:
23
+ import pdfplumber
24
+ HAS_PDFPLUMBER = True
25
+ except ImportError:
26
+ HAS_PDFPLUMBER = False
27
+
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger("fixed_pdf_extractor")
30
+
31
+ class FixedPDFExtractor:
32
+ def __init__(self):
33
+ logger.info("🚀 Initializing Fixed PDF Extractor")
34
+
35
+ def extract_everything(self, pdf_path: str) -> Dict[str, Any]:
36
+ if not HAS_PDFPLUMBER:
37
+ raise RuntimeError("pdfplumber is required. Install with: pip install pdfplumber")
38
+
39
+ logger.info(f"📖 Processing PDF: {pdf_path}")
40
+ result = {
41
+ "document_info": {
42
+ "filename": Path(pdf_path).name,
43
+ "total_pages": 0,
44
+ "extraction_timestamp": datetime.now().isoformat()
45
+ },
46
+ "extracted_data": {
47
+ "all_text_content": [],
48
+ "all_tables": [],
49
+ "key_value_pairs": {},
50
+ "audit_information": {},
51
+ "operator_information": {},
52
+ "vehicle_registrations": [],
53
+ "driver_records": [],
54
+ "compliance_summary": {},
55
+ "dates_and_numbers": {}
56
+ }
57
+ }
58
+
59
+ all_text_blocks, all_tables = [], []
60
+
61
+ with pdfplumber.open(pdf_path) as pdf:
62
+ result["document_info"]["total_pages"] = len(pdf.pages)
63
+
64
+ for page_num, page in enumerate(pdf.pages, 1):
65
+ logger.info(f"📄 Processing page {page_num}")
66
+
67
+ # Extract text with better handling
68
+ page_text = self._extract_page_text(page)
69
+ if page_text:
70
+ all_text_blocks.append({
71
+ "page": page_num,
72
+ "text": page_text,
73
+ "word_count": len(page_text.split())
74
+ })
75
+
76
+ # Extract tables with improved cleaning
77
+ tables = self._extract_page_tables(page, page_num)
78
+ all_tables.extend(tables)
79
+
80
+ result["extracted_data"]["all_text_content"] = all_text_blocks
81
+ result["extracted_data"]["all_tables"] = all_tables
82
+
83
+ # Process extracted data with improved methods
84
+ combined_text = "\n\n".join(b["text"] for b in all_text_blocks)
85
+
86
+ result["extracted_data"]["key_value_pairs"] = self._extract_key_value_pairs_improved(combined_text)
87
+ result["extracted_data"]["audit_information"] = self._extract_audit_info(combined_text, all_tables)
88
+ result["extracted_data"]["operator_information"] = self._extract_operator_info(combined_text, all_tables)
89
+ result["extracted_data"]["vehicle_registrations"] = self._extract_vehicle_registrations(all_tables)
90
+ result["extracted_data"]["driver_records"] = self._extract_driver_records(all_tables)
91
+ result["extracted_data"]["compliance_summary"] = self._extract_compliance_summary(combined_text, all_tables)
92
+ result["extracted_data"]["dates_and_numbers"] = self._extract_dates_and_numbers_improved(combined_text)
93
+
94
+ # Generate summary
95
+ result["extraction_summary"] = {
96
+ "text_blocks_found": len(all_text_blocks),
97
+ "tables_found": len(all_tables),
98
+ "key_value_pairs_found": len(result["extracted_data"]["key_value_pairs"]),
99
+ "vehicle_registrations_found": len(result["extracted_data"]["vehicle_registrations"]),
100
+ "driver_records_found": len(result["extracted_data"]["driver_records"]),
101
+ "total_characters": len(combined_text),
102
+ "processing_timestamp": datetime.now().isoformat()
103
+ }
104
+
105
+ logger.info("✅ Extraction completed!")
106
+ return result
107
+
108
+ def _extract_page_text(self, page) -> Optional[str]:
109
+ """Extract text from page with better handling"""
110
+ try:
111
+ text = page.extract_text()
112
+ if text:
113
+ # Clean up text
114
+ text = re.sub(r'[ \t]+', ' ', text.strip())
115
+ text = re.sub(r'\n\s*\n', '\n', text)
116
+ return text
117
+ except Exception as e:
118
+ logger.warning(f"Failed to extract text from page: {e}")
119
+ return None
120
+
121
+ def _extract_page_tables(self, page, page_num: int) -> List[Dict]:
122
+ """Extract tables with improved processing"""
123
+ tables = []
124
+ try:
125
+ raw_tables = page.extract_tables()
126
+ if raw_tables:
127
+ for table_idx, table in enumerate(raw_tables):
128
+ cleaned_table = self._clean_table_improved(table)
129
+ if cleaned_table and len(cleaned_table) > 0:
130
+ tables.append({
131
+ "page": page_num,
132
+ "table_index": table_idx + 1,
133
+ "headers": cleaned_table[0] if cleaned_table else [],
134
+ "data": cleaned_table[1:] if len(cleaned_table) > 1 else [],
135
+ "raw_data": cleaned_table,
136
+ "row_count": len(cleaned_table) - 1 if len(cleaned_table) > 1 else 0,
137
+ "column_count": len(cleaned_table[0]) if cleaned_table else 0
138
+ })
139
+ except Exception as e:
140
+ logger.warning(f"Failed to extract tables from page {page_num}: {e}")
141
+
142
+ return tables
143
+
144
+ def _clean_table_improved(self, table: List[List]) -> List[List[str]]:
145
+ """Improved table cleaning with better cell processing"""
146
+ if not table:
147
+ return []
148
+
149
+ cleaned = []
150
+ for row in table:
151
+ cleaned_row = []
152
+ for cell in row:
153
+ if cell is None:
154
+ cleaned_cell = ""
155
+ else:
156
+ cleaned_cell = str(cell).strip()
157
+ cleaned_cell = re.sub(r'\s+', ' ', cleaned_cell)
158
+ cleaned_cell = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', cleaned_cell)
159
+ cleaned_row.append(cleaned_cell)
160
+ if any(cell.strip() for cell in cleaned_row):
161
+ cleaned.append(cleaned_row)
162
+
163
+ # Rows are already cleaned; the equal-length check below does not change the result
164
+ if cleaned and all(len(r) == len(cleaned[0]) for r in cleaned):
165
+ return cleaned
166
+ return cleaned
167
+
168
+ def _extract_key_value_pairs_improved(self, text: str) -> Dict[str, str]:
169
+ """Improved key-value pair extraction with better cleaning"""
170
+ pairs: Dict[str, str] = {}
171
+
172
+ # Normalize text a bit for regex stability
173
+ t = text.replace('\r', '\n')
174
+
175
+ # Pattern 1: colon-separated pairs (key: value)
176
+ pattern1 = re.compile(
177
+ r'([A-Za-z][\w\s()/\-.]{2,80}?):\s*([^\n\r:][^\n\r]*)'
178
+ )
179
+ for key, val in pattern1.findall(t):
180
+ k = key.strip()
181
+ v = val.strip()
182
+ # Filter junk: very long values, pure separators, or obvious headers
183
+ if not v or len(v) > 200:
184
+ continue
185
+ if re.fullmatch(r'[-_/\.]+', v):
186
+ continue
187
+ # Avoid capturing the next key as value by trimming trailing key-like tokens
188
+ v = re.sub(r'\s+[A-Z][\w\s()/\-.]{2,40}:$', '', v).strip()
189
+ # Skip values that are just long digit runs (likely id lists without meaning)
190
+ if re.fullmatch(r'\d{6,}', v):
191
+ continue
192
+ pairs[k] = v
193
+
194
+ # Pattern 2: inline “Key – Value” or “Key — Value”
195
+ pattern2 = re.compile(r'([A-Za-z][\w\s()/\-.]{2,80}?)\s*[–—-]\s*([^\n\r]+)')
196
+ for key, val in pattern2.findall(t):
197
+ k = key.strip()
198
+ v = val.strip()
199
+ if v and len(v) <= 200 and not re.fullmatch(r'\d{6,}', v):
200
+ pairs.setdefault(k, v)
201
+
202
+ return pairs
203
+
204
+ def _extract_audit_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
205
+ """Extract audit-specific information with better filtering"""
206
+ audit_info: Dict[str, Any] = {}
207
+
208
+ # Prefer tables
209
+ for table in tables:
210
+ headers = [str(h).lower() for h in table.get("headers", [])]
211
+ joined = ' '.join(headers)
212
+ if "audit information" in joined or "auditinformation" in joined:
213
+ data = table.get("data", [])
214
+ for row in data:
215
+ if len(row) >= 2 and row[0] and row[1]:
216
+ key = str(row[0]).strip()
217
+ value = str(row[1]).strip()
218
+ # Skip numbered list rows (e.g., "1.", "2)")
219
+ if re.match(r'^\s*\d+\s*[.)]\s*$', key):
220
+ continue
221
+ if key and value:
222
+ audit_info[key] = value
223
+
224
+ # Backup from text
225
+ candidates = {
226
+ "Date of Audit": r'Date\s+of\s+Audit[:\s]*([^\n\r]+)',
227
+ "Location of audit": r'Location\s+of\s+audit[:\s]*([^\n\r]+)',
228
+ "Auditor name": r'Auditor\s+name[:\s]*([^\n\r]+)',
229
+ "Audit Matrix Identifier (Name or Number)": r'Audit\s+Matrix\s+Identifier.*?[:\s]*([^\n\r]+)',
230
+ }
231
+ for k, pat in candidates.items():
232
+ if k not in audit_info:
233
+ m = re.search(pat, text, re.IGNORECASE)
234
+ if m:
235
+ audit_info[k] = m.group(1).strip()
236
+
237
+ return audit_info
238
+
239
+ def _extract_operator_info(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
240
+ """Extract operator information with better table parsing"""
241
+ operator_info: Dict[str, Any] = {}
242
+
243
+ # Look for operator information in tables first
244
+ for table in tables:
245
+ headers = [str(h).lower() for h in table.get("headers", [])]
246
+ if ("operatorinformation" in ' '.join(headers) or
247
+ "operator information" in ' '.join(headers) or
248
+ "operatorcontactdetails" in ' '.join(headers)):
249
+
250
+ data = table.get("data", [])
251
+ for row in data:
252
+ if len(row) >= 2 and row[0] and row[1]:
253
+ key = str(row[0]).strip()
254
+ value = str(row[1]).strip()
255
+ if key and value:
256
+ # Clean up key names
257
+ kl = key.lower()
258
+ if "operator name" in kl:
259
+ operator_info["operator_name"] = value
260
+ elif "trading name" in kl:
261
+ operator_info["trading_name"] = value
262
+ elif "company number" in kl:
263
+ if len(row) > 2:
264
+ company_parts = [str(r).strip() for r in row[1:] if str(r).strip()]
265
+ operator_info["company_number"] = "".join(company_parts)
266
+ else:
267
+ operator_info["company_number"] = value
268
+ elif "business address" in kl:
269
+ operator_info["business_address"] = value
270
+ elif "postal address" in kl:
271
+ operator_info["postal_address"] = value
272
+ elif "email" in kl:
273
+ operator_info["email"] = value
274
+ elif "telephone" in kl or "phone" in kl:
275
+ operator_info["phone"] = value
276
+ elif "nhvas accreditation" in kl:
277
+ operator_info["nhvas_accreditation"] = value
278
+ elif "nhvas manual" in kl:
279
+ operator_info["nhvas_manual"] = value
280
+
281
+ # Extract from text patterns as backup
282
+ patterns = {
283
+ 'operator_name': r'Operator\s*name[:\s\(]*([^\n\r\)]+?)(?=\s*NHVAS|\s*Registered|$)',
284
+ 'trading_name': r'Registered\s*trading\s*name[:\s\/]*([^\n\r]+?)(?=\s*Australian|$)',
285
+ 'company_number': r'Australian\s*Company\s*Number[:\s]*([0-9\s]+?)(?=\s*NHVAS|$)',
286
+ 'business_address': r'Operator\s*business\s*address[:\s]*([^\n\r]+?)(?=\s*Operator\s*Postal|$)',
287
+ 'postal_address': r'Operator\s*Postal\s*address[:\s]*([^\n\r]+?)(?=\s*Email|$)',
288
+ 'email': r'Email\s*address[:\s]*([^\s\n\r]+)',
289
+ 'phone': r'Operator\s*Telephone\s*Number[:\s]*([^\s\n\r]+)',
290
+ 'nhvas_accreditation': r'NHVAS\s*Accreditation\s*No\.[:\s\(]*([^\n\r\)]+)',
291
+ }
292
+
293
+ for key, pattern in patterns.items():
294
+ if key not in operator_info: # Only use text if not found in tables
295
+ match = re.search(pattern, text, re.IGNORECASE)
296
+ if match:
297
+ value = match.group(1).strip()
298
+ if value and len(value) < 200:
299
+ if key == 'company_number':
300
+ value = re.sub(r'\s+', '', value)
301
+ operator_info[key] = value
302
+
303
+ return operator_info
304
+
305
+ def _extract_vehicle_registrations(self, tables: List[Dict]) -> List[Dict]:
306
+ """Extract vehicle registration information from tables"""
307
+ vehicles: List[Dict[str, Any]] = []
308
+
309
+ for table in tables:
310
+ headers = [str(h).lower() for h in table.get("headers", [])]
311
+
312
+ # Look for vehicle registration tables
313
+ if any(keyword in ' '.join(headers) for keyword in ['registration', 'vehicle', 'number']):
314
+ reg_col = None
315
+ for i, header in enumerate(headers):
316
+ if 'registration' in header and 'number' in header:
317
+ reg_col = i
318
+ break
319
+
320
+ if reg_col is not None:
321
+ data = table.get("data", [])
322
+ for row in data:
323
+ if len(row) > reg_col and row[reg_col]:
324
+ reg_num = str(row[reg_col]).strip()
325
+ # Validate registration format (letters/numbers)
326
+ if re.match(r'^[A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3}$', reg_num):
327
+ vehicle_info = {"registration_number": reg_num}
328
+
329
+ # Add other columns as additional info
330
+ for i, header in enumerate(table.get("headers", [])):
331
+ if i < len(row) and i != reg_col:
332
+ vehicle_info[str(header)] = str(row[i]).strip()
333
+
334
+ vehicles.append(vehicle_info)
335
+
336
+ return vehicles
337
+
338
+ def _extract_driver_records(self, tables: List[Dict]) -> List[Dict]:
339
+ """Extract driver records from tables"""
340
+ drivers: List[Dict[str, Any]] = []
341
+
342
+ for table in tables:
343
+ headers = [str(h).lower() for h in table.get("headers", [])]
344
+
345
+ # Look for driver/scheduler tables
346
+ if any(keyword in ' '.join(headers) for keyword in ['driver', 'scheduler', 'name']):
347
+ name_col = None
348
+ for i, header in enumerate(headers):
349
+ if 'name' in header:
350
+ name_col = i
351
+ break
352
+
353
+ if name_col is not None:
354
+ data = table.get("data", [])
355
+ for row in data:
356
+ if len(row) > name_col and row[name_col]:
357
+ name = str(row[name_col]).strip()
358
+ # Basic name validation
359
+ if re.match(r'^[A-Za-z\s]{2,}$', name) and len(name.split()) >= 2:
360
+ driver_info = {"name": name}
361
+
362
+ # Add other columns
363
+ for i, header in enumerate(table.get("headers", [])):
364
+ if i < len(row) and i != name_col:
365
+ driver_info[str(header)] = str(row[i]).strip()
366
+
367
+ drivers.append(driver_info)
368
+
369
+ return drivers
370
+
371
+ def _extract_compliance_summary(self, text: str, tables: List[Dict]) -> Dict[str, Any]:
372
+ """Extract compliance information"""
373
+ compliance = {
374
+ "standards_compliance": {},
375
+ "compliance_codes": {},
376
+ "audit_results": []
377
+ }
378
+
379
+ # Look for compliance tables
380
+ for table in tables:
381
+ headers = [str(h).lower() for h in table.get("headers", [])]
382
+
383
+ if any(keyword in ' '.join(headers) for keyword in ['compliance', 'standard', 'requirement']):
384
+ data = table.get("data", [])
385
+ for row in data:
386
+ if len(row) >= 2:
387
+ standard = str(row[0]).strip()
388
+ code = str(row[1]).strip()
389
+ if standard.startswith('Std') and code in ['V', 'NC', 'SFI', 'NAP', 'NA']:
390
+ compliance["standards_compliance"][standard] = code
391
+
392
+ # Extract compliance codes definitions
393
+ code_patterns = {
394
+ 'V': r'\bV\b\s+([^\n\r]+)',
395
+ 'NC': r'\bNC\b\s+([^\n\r]+)',
396
+ 'SFI': r'\bSFI\b\s+([^\n\r]+)',
397
+ 'NAP': r'\bNAP\b\s+([^\n\r]+)',
398
+ 'NA': r'\bNA\b\s+([^\n\r]+)',
399
+ }
400
+
401
+ for code, pattern in code_patterns.items():
402
+ match = re.search(pattern, text, re.IGNORECASE)
403
+ if match:
404
+ compliance["compliance_codes"][code] = match.group(1).strip()
405
+
406
+ return compliance
407
+
408
+ def _extract_dates_and_numbers_improved(self, text: str) -> Dict[str, Any]:
409
+ """Improved date and number extraction"""
410
+ result = {
411
+ "dates": [],
412
+ "registration_numbers": [],
413
+ "phone_numbers": [],
414
+ "email_addresses": [],
415
+ "reference_numbers": []
416
+ }
417
+
418
+ # Date patterns
419
+ date_patterns = [
420
+ r'\b(\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})\b',
421
+ r'\b(\d{1,2}/\d{1,2}/\d{4})\b',
422
+ r'\b(\d{1,2}-\d{1,2}-\d{4})\b',
423
+ r'\b(\d{1,2}\.\d{1,2}\.\d{4})\b',
424
+ ]
425
+ for pattern in date_patterns:
426
+ result["dates"].extend(re.findall(pattern, text))
427
+
428
+ # Registration numbers (Australian format-ish)
429
+ reg_pattern = r'\b([A-Z]{1,3}\s*\d{1,3}\s*[A-Z]{0,3})\b'
430
+ result["registration_numbers"] = list(set(re.findall(reg_pattern, text)))
431
+
432
+ # Phone numbers (AU)
433
+ phone_pattern = r'\b((?:\+61|0)[2-9]\s?\d{4}\s?\d{4})\b'
434
+ result["phone_numbers"] = list(set(re.findall(phone_pattern, text)))
435
+
436
+ # Email addresses
437
+ email_pattern = r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b'
438
+ result["email_addresses"] = list(set(re.findall(email_pattern, text)))
439
+
440
+ # Reference numbers
441
+ ref_patterns = [
442
+ (r'RF(?:S)?\s*#?\s*(\d+)', 'RFS_Certifications'),
443
+ (r'NHVAS\s+Accreditation\s+No\.?\s*(\d+)', 'NHVAS_Numbers'),
444
+ (r'Registration\s+Number\s*#?\s*(\d+)', 'Registration_Numbers'),
445
+ ]
446
+ for pattern, key in ref_patterns:
447
+ matches = re.findall(pattern, text, re.IGNORECASE)
448
+ if matches:
449
+ result["reference_numbers"].extend([f"{key}: {m}" for m in matches])
450
+
451
+ return result
452
+
453
+ @staticmethod
454
+ def save_results(results: Dict[str, Any], output_path: str):
455
+ """Save results to JSON file"""
456
+ try:
457
+ with open(output_path, 'w', encoding='utf-8') as f:
458
+ json.dump(results, f, indent=2, ensure_ascii=False)
459
+ logger.info(f"💾 Results saved to {output_path}")
460
+ except Exception as e:
461
+ logger.error(f"Failed to save results: {e}")
462
+
463
+ @staticmethod
464
+ def export_to_excel(results: Dict[str, Any], excel_path: str):
465
+ """Export results to Excel with improved formatting"""
466
+ try:
467
+ with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
468
+ # Summary sheet
469
+ summary_data = []
470
+ extraction_summary = results.get("extraction_summary", {})
471
+ for key, value in extraction_summary.items():
472
+ summary_data.append({"Metric": key.replace("_", " ").title(), "Value": value})
473
+ pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
474
+
475
+ # Key-value pairs
476
+ kv_pairs = results.get("extracted_data", {}).get("key_value_pairs", {})
477
+ if kv_pairs:
478
+ kv_df = pd.DataFrame(list(kv_pairs.items()), columns=['Key', 'Value'])
479
+ kv_df.to_excel(writer, sheet_name='Key_Value_Pairs', index=False)
480
+
481
+ # Vehicle registrations
482
+ vehicles = results.get("extracted_data", {}).get("vehicle_registrations", [])
483
+ if vehicles:
484
+ pd.DataFrame(vehicles).to_excel(writer, sheet_name='Vehicle_Registrations', index=False)
485
+
486
+ # Driver records
487
+ drivers = results.get("extracted_data", {}).get("driver_records", [])
488
+ if drivers:
489
+ pd.DataFrame(drivers).to_excel(writer, sheet_name='Driver_Records', index=False)
490
+
491
+ # Compliance summary
492
+ compliance = results.get("extracted_data", {}).get("compliance_summary", {})
493
+ if compliance.get("standards_compliance"):
494
+ comp_df = pd.DataFrame(list(compliance["standards_compliance"].items()),
495
+ columns=['Standard', 'Compliance_Code'])
496
+ comp_df.to_excel(writer, sheet_name='Compliance_Standards', index=False)
497
+
498
+ logger.info(f"📊 Results exported to Excel: {excel_path}")
499
+ except Exception as e:
500
+ logger.error(f"Failed to export to Excel: {e}")
501
+
502
+ def main():
503
+ if len(sys.argv) < 2:
504
+ print("Usage: python fixed_pdf_extractor.py <pdf_path>")
505
+ sys.exit(1)
506
+
507
+ pdf_path = Path(sys.argv[1])
508
+ if not pdf_path.exists():
509
+ print(f"❌ PDF not found: {pdf_path}")
510
+ sys.exit(1)
511
+
512
+ print("🚀 Fixed PDF Data Extractor")
513
+ print("=" * 50)
514
+
515
+ extractor = FixedPDFExtractor()
516
+ results = extractor.extract_everything(str(pdf_path))
517
+
518
+ base = pdf_path.stem
519
+ output_dir = pdf_path.parent
520
+
521
+ # Save outputs
522
+ json_path = output_dir / f"{base}_comprehensive_data.json"
523
+ excel_path = output_dir / f"{base}_fixed_extraction.xlsx"
524
+
525
+ FixedPDFExtractor.save_results(results, str(json_path))
526
+ FixedPDFExtractor.export_to_excel(results, str(excel_path))
527
+
528
+ print("\n💾 OUTPUT FILES:")
529
+ print(f" 📄 JSON Data: {json_path}")
530
+ print(f" 📊 Excel Data: {excel_path}")
531
+ print(f"\n✨ FIXED EXTRACTION COMPLETE!")
532
+
533
+ if __name__ == "__main__":
534
+ main()
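
For reference, a minimal sketch of driving the extractor from Python instead of the CLI entry point above; it assumes the file is importable as `extract_pdf_data` and uses a hypothetical `sample.pdf` path.

```python
from pathlib import Path

from extract_pdf_data import FixedPDFExtractor  # assumed import name for this module

pdf_path = Path("sample.pdf")  # hypothetical input PDF

extractor = FixedPDFExtractor()
results = extractor.extract_everything(str(pdf_path))

# Write the same two artifacts main() produces, next to the source PDF.
FixedPDFExtractor.save_results(results, str(pdf_path.with_name(f"{pdf_path.stem}_comprehensive_data.json")))
FixedPDFExtractor.export_to_excel(results, str(pdf_path.with_name(f"{pdf_path.stem}_fixed_extraction.xlsx")))

# The nested result layout mirrors what export_to_excel() reads back out.
vehicles = results.get("extracted_data", {}).get("vehicle_registrations", [])
print(f"{len(vehicles)} vehicle registration rows extracted")
```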
space-pdf/extract_red_text.py ADDED
@@ -0,0 +1,764 @@
1
+ #!/usr/bin/env python3
2
+ import re
3
+ import json
4
+ import sys
5
+ from docx import Document
6
+ from docx.oxml.ns import qn
7
+ from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
+
9
+ def normalize_header_label(s: str) -> str:
10
+ """Normalize a header/label by stripping parentheticals & punctuation."""
11
+ s = re.sub(r"\s+", " ", s.strip())
12
+ # remove content in parentheses/brackets
13
+ s = re.sub(r"\([^)]*\)", "", s)
14
+ s = re.sub(r"\[[^]]*\]", "", s)
15
+ # unify slashes and hyphens, collapse spaces
16
+ s = re.sub(r"\s+", " ", s.replace("–", "-").replace("—", "-").replace("/", " / "))
17
+ return s.strip()
18
+
19
+ # Canonical label aliases for Vehicle/Maintenance/General headers
20
+ LABEL_ALIASES = {
21
+ # Vehicle Registration (Maintenance)
22
+ "roadworthiness certificates": "Roadworthiness Certificates",
23
+ "maintenance records": "Maintenance Records",
24
+ "daily checks": "Daily Checks",
25
+ "fault recording / reporting": "Fault Recording/ Reporting",
26
+ "fault repair": "Fault Repair",
27
+
28
+ # Vehicle Registration (Mass)
29
+ "sub contracted vehicles statement of compliance": "Sub-contracted Vehicles Statement of Compliance",
30
+ "weight verification records": "Weight Verification Records",
31
+ "rfs suspension certification #": "RFS Suspension Certification #",
32
+ "suspension system maintenance": "Suspension System Maintenance",
33
+ "trip records": "Trip Records",
34
+ "fault recording/ reporting on suspension system": "Fault Recording/ Reporting on Suspension System",
35
+
36
+ # Common
37
+ "registration number": "Registration Number",
38
+ "no.": "No.",
39
+ "sub contractor": "Sub contractor",
40
+ "sub-contractor": "Sub contractor",
41
+ }
42
+
43
+ def looks_like_operator_declaration(context):
44
+ """True iff heading says Operator Declaration and headers include Print Name + Position Title."""
45
+ heading = (context.get("heading") or "").strip().lower()
46
+ headers = " ".join(context.get("headers") or []).lower()
47
+ return (
48
+ "operator declaration" in heading
49
+ and "print name" in headers
50
+ and "position" in headers
51
+ and "title" in headers
52
+ )
53
+
54
+ def looks_like_auditor_declaration(context):
55
+ heading = (context.get("heading") or "").strip().lower()
56
+ headers = " ".join(context.get("headers") or []).lower()
57
+ return (
58
+ "auditor declaration" in heading
59
+ and "print name" in headers
60
+ and ("nhvr" in headers or "auditor registration number" in headers)
61
+ )
62
+
63
+ # --- NEW: header-only fallback that ignores headings and just keys on the two column names
64
+ def extract_operator_declaration_by_headers_from_end(doc):
65
+ """
66
+ Scan tables from the end; if a table's first row contains both
67
+ 'Print Name' AND 'Position Title' (case-insensitive), extract red text
68
+ from the data rows into:
69
+ {"Print Name": [...], "Position Title": [...]}
70
+ """
71
+ for tbl in reversed(doc.tables):
72
+ if len(tbl.rows) < 2:
73
+ continue # need header + at least one data row
74
+
75
+ headers_norm = [normalize_header_label(c.text).lower() for c in tbl.rows[0].cells]
76
+ has_print = any("print name" in h for h in headers_norm)
77
+ has_pos_tit = any(("position title" in h) or ("position" in h and "title" in h) for h in headers_norm)
78
+ if not (has_print and has_pos_tit):
79
+ continue
80
+
81
+ idx_print = next((i for i, h in enumerate(headers_norm) if "print name" in h), None)
82
+ idx_pos = next((i for i, h in enumerate(headers_norm) if "position title" in h), None)
83
+ if idx_pos is None:
84
+ idx_pos = next((i for i, h in enumerate(headers_norm) if ("position" in h and "title" in h)), None)
85
+
86
+ result = {"Print Name": [], "Position Title": []}
87
+ for row in tbl.rows[1:]:
88
+ if idx_print is not None and idx_print < len(row.cells):
89
+ cell = row.cells[idx_print]
90
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
91
+ reds = coalesce_numeric_runs(reds)
92
+ txt = normalize_text(" ".join(reds))
93
+ if txt:
94
+ result["Print Name"].append(txt)
95
+
96
+ if idx_pos is not None and idx_pos < len(row.cells):
97
+ cell = row.cells[idx_pos]
98
+ reds = [r.text for p in cell.paragraphs for r in p.runs if is_red_font(r) and r.text]
99
+ reds = coalesce_numeric_runs(reds)
100
+ txt = normalize_text(" ".join(reds))
101
+ if txt:
102
+ result["Position Title"].append(txt)
103
+
104
+ if result["Print Name"] or result["Position Title"]:
105
+ return {k: v for k, v in result.items() if v}
106
+
107
+ return None
108
+ # --- end NEW helper
109
+
110
+ def canonicalize_label(s: str) -> str:
111
+ key = normalize_header_label(s).lower()
112
+ key = re.sub(r"\s+", " ", key)
113
+ return LABEL_ALIASES.get(key, s)
114
+
115
+ def bag_similarity(a: str, b: str) -> float:
116
+ """Loose bag-of-words similarity for header↔label matching."""
117
+ aw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(a).lower()) if len(w) > 2 or w in {"#","no"}}
118
+ bw = {w for w in re.split(r"[^A-Za-z0-9#]+", normalize_header_label(b).lower()) if len(w) > 2 or w in {"#","no"}}
119
+ if not aw or not bw:
120
+ return 0.0
121
+ inter = len(aw & bw)
122
+ return inter / max(len(aw), len(bw))
123
+
124
+ def coalesce_numeric_runs(text_list):
125
+ """
126
+ If a cell yields ['4','5','6','9','8','7','1','2','3'] etc., join continuous single-char digit runs.
127
+ Returns ['456987123'] instead of many singles. Non-digit tokens are preserved.
128
+ """
129
+ out, buf = [], []
130
+ for t in text_list:
131
+ if len(t) == 1 and t.isdigit():
132
+ buf.append(t)
133
+ else:
134
+ if buf:
135
+ out.append("".join(buf))
136
+ buf = []
137
+ out.append(t)
138
+ if buf:
139
+ out.append("".join(buf))
140
+ return out
141
+
142
+ def is_red_font(run):
143
+ """Enhanced red font detection with better color checking"""
144
+ col = run.font.color
145
+ if col and col.rgb:
146
+ r, g, b = col.rgb
147
+ if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
148
+ return True
149
+ rPr = getattr(run._element, "rPr", None)
150
+ if rPr is not None:
151
+ clr = rPr.find(qn('w:color'))
152
+ if clr is not None:
153
+ val = clr.get(qn('w:val'))
154
+ if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
155
+ rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
156
+ if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
157
+ return True
158
+ return False
159
+
160
+ def _prev_para_text(tbl):
161
+ """Get text from previous paragraph before table"""
162
+ prev = tbl._tbl.getprevious()
163
+ while prev is not None and not prev.tag.endswith("}p"):
164
+ prev = prev.getprevious()
165
+ if prev is None:
166
+ return ""
167
+ return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
168
+
169
+ def normalize_text(text):
170
+ """Normalize text for better matching"""
171
+ return re.sub(r'\s+', ' ', text.strip())
172
+
173
+ def fuzzy_match_heading(heading, patterns):
174
+ """Check if heading matches any pattern with fuzzy matching"""
175
+ heading_norm = normalize_text(heading.upper())
176
+ for pattern in patterns:
177
+ if re.search(pattern, heading_norm, re.IGNORECASE):
178
+ return True
179
+ return False
180
+
181
+ def get_table_context(tbl):
182
+ """Get comprehensive context information for table"""
183
+ heading = normalize_text(_prev_para_text(tbl))
184
+ headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()]
185
+ col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells[0].text.strip()]
186
+ first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
187
+ all_cells = []
188
+ for row in tbl.rows:
189
+ for cell in row.cells:
190
+ text = normalize_text(cell.text)
191
+ if text:
192
+ all_cells.append(text)
193
+ return {
194
+ 'heading': heading,
195
+ 'headers': headers,
196
+ 'col0': col0,
197
+ 'first_cell': first_cell,
198
+ 'all_cells': all_cells,
199
+ 'num_rows': len(tbl.rows),
200
+ 'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
201
+ }
202
+
203
+ def calculate_schema_match_score(schema_name, spec, context):
204
+ """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
205
+ score = 0
206
+ reasons = []
207
+
208
+ # 🎯 VEHICLE REGISTRATION BOOST
209
+ if "Vehicle Registration" in schema_name:
210
+ vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
211
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
212
+ keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
213
+ if keyword_matches >= 2:
214
+ score += 150 # Very high boost for vehicle tables
215
+ reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
216
+ elif keyword_matches >= 1:
217
+ score += 75 # Medium boost
218
+ reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
219
+
220
+ # 🎯 SUMMARY TABLE BOOST (existing logic)
221
+ if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
222
+ score += 100
223
+ reasons.append(f"Summary schema with DETAILS column - perfect match")
224
+
225
+ if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
226
+ score -= 75
227
+ reasons.append(f"Non-summary schema penalized for DETAILS column presence")
228
+
229
+ # Context exclusions
230
+ if spec.get("context_exclusions"):
231
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
232
+ for exclusion in spec["context_exclusions"]:
233
+ if exclusion.lower() in table_text:
234
+ score -= 50
235
+ reasons.append(f"Context exclusion penalty: '{exclusion}' found")
236
+
237
+ # Context keywords
238
+ if spec.get("context_keywords"):
239
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
240
+ keyword_matches = 0
241
+ for keyword in spec["context_keywords"]:
242
+ if keyword.lower() in table_text:
243
+ keyword_matches += 1
244
+
245
+ if keyword_matches > 0:
246
+ score += keyword_matches * 15
247
+ reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
248
+
249
+ # Direct first cell match
250
+ if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
251
+ score += 100
252
+ reasons.append(f"Direct first cell match: '{context['first_cell']}'")
253
+
254
+ # Heading pattern matching
255
+ if spec.get("headings"):
256
+ for h in spec["headings"]:
257
+ if fuzzy_match_heading(context['heading'], [h["text"]]):
258
+ score += 50
259
+ reasons.append(f"Heading match: '{context['heading']}'")
260
+ break
261
+
262
+ # Column header matching
263
+ if spec.get("columns"):
264
+ cols = [normalize_text(col) for col in spec["columns"]]
265
+ matches = 0
266
+ for col in cols:
267
+ if any(col.upper() in h.upper() for h in context['headers']):
268
+ matches += 1
269
+ if matches == len(cols):
270
+ score += 60
271
+ reasons.append(f"All column headers match: {cols}")
272
+ elif matches > 0:
273
+ score += matches * 20
274
+ reasons.append(f"Partial column matches: {matches}/{len(cols)}")
275
+
276
+ # Label matching for left-oriented tables
277
+ if spec.get("orientation") == "left":
278
+ labels = [normalize_text(lbl) for lbl in spec["labels"]]
279
+ matches = 0
280
+ for lbl in labels:
281
+ if any(lbl.upper() in c.upper() or c.upper() in lbl.upper() for c in context['col0']):
282
+ matches += 1
283
+ if matches > 0:
284
+ score += (matches / len(labels)) * 30
285
+ reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
286
+
287
+ # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
288
+ elif spec.get("orientation") == "row1":
289
+ labels = [normalize_text(lbl) for lbl in spec["labels"]]
290
+ matches = 0
291
+ for lbl in labels:
292
+ if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
293
+ matches += 1
294
+ elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
295
+ matches += 0.5 # Partial credit
296
+ if matches > 0:
297
+ score += (matches / len(labels)) * 40
298
+ reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
299
+
300
+ # Special handling for Declaration tables (existing logic)
301
+ if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
302
+ if "OPERATOR DECLARATION" in context['heading'].upper():
303
+ score += 80
304
+ reasons.append("Operator Declaration context match")
305
+ elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
306
+ score += 60
307
+ reasons.append("Manager found in cells (likely Operator Declaration)")
308
+
309
+ if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
310
+ if any("MANAGER" in cell.upper() for cell in context['all_cells']):
311
+ score -= 50
312
+ reasons.append("Penalty: Manager found (not auditor)")
313
+
314
+ return score, reasons
315
+
316
+ def match_table_schema(tbl):
317
+ """Improved table schema matching with scoring system"""
318
+ context = get_table_context(tbl)
319
+ # Auditor Declaration first
320
+ if ("print name" in " ".join(context.get("headers", [])).lower() and
321
+ "auditor" in " ".join(context.get("headers", [])).lower()):
322
+ return "NHVAS Approved Auditor Declaration"
323
+ # NEW: prioritize Auditor Declaration to avoid misclassification
324
+ if looks_like_auditor_declaration(context):
325
+ return "NHVAS Approved Auditor Declaration"
326
+ # hard-match Operator Declaration first (high priority, avoids misclassification)
327
+ if looks_like_operator_declaration(context):
328
+ return "Operator Declaration"
329
+ best_match = None
330
+ best_score = 0
331
+ for name, spec in TABLE_SCHEMAS.items():
332
+ score, reasons = calculate_schema_match_score(name, spec, context)
333
+ if score > best_score:
334
+ best_score = score
335
+ best_match = name
336
+ if best_score >= 20:
337
+ return best_match
338
+ return None
339
+
340
+ def check_multi_schema_table(tbl):
341
+ """Check if table contains multiple schemas and split appropriately"""
342
+ context = get_table_context(tbl)
343
+ operator_labels = ["Operator name (Legal entity)", "NHVAS Accreditation No.", "Registered trading name/s",
344
+ "Australian Company Number", "NHVAS Manual"]
345
+ contact_labels = ["Operator business address", "Operator Postal address", "Email address", "Operator Telephone Number"]
346
+ has_operator = any(any(op_lbl.upper() in cell.upper() for op_lbl in operator_labels) for cell in context['col0'])
347
+ has_contact = any(any(cont_lbl.upper() in cell.upper() for cont_lbl in contact_labels) for cell in context['col0'])
348
+ if has_operator and has_contact:
349
+ return ["Operator Information", "Operator contact details"]
350
+ return None
351
+
352
+ def extract_multi_schema_table(tbl, schemas):
353
+ """Extract data from table with multiple schemas"""
354
+ result = {}
355
+ for schema_name in schemas:
356
+ if schema_name not in TABLE_SCHEMAS:
357
+ continue
358
+ spec = TABLE_SCHEMAS[schema_name]
359
+ schema_data = {}
360
+ for ri, row in enumerate(tbl.rows):
361
+ if ri == 0:
362
+ continue
363
+ row_label = normalize_text(row.cells[0].text)
364
+ belongs_to_schema = False
365
+ matched_label = None
366
+ for spec_label in spec["labels"]:
367
+ spec_norm = normalize_text(spec_label).upper()
368
+ row_norm = row_label.upper()
369
+ if spec_norm == row_norm or spec_norm in row_norm or row_norm in spec_norm:
370
+ belongs_to_schema = True
371
+ matched_label = spec_label
372
+ break
373
+ if not belongs_to_schema:
374
+ continue
375
+ for ci, cell in enumerate(row.cells):
376
+ red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
377
+ if red_txt:
378
+ if matched_label not in schema_data:
379
+ schema_data[matched_label] = []
380
+ if red_txt not in schema_data[matched_label]:
381
+ schema_data[matched_label].append(red_txt)
382
+ if schema_data:
383
+ result[schema_name] = schema_data
384
+ return result
385
+
386
+ def extract_table_data(tbl, schema_name, spec):
387
+ """Extract red text data from table based on schema – per-row repeats for specific tables."""
388
+
389
+ # ───────────────────────────────────────────────────────────────────────────
390
+ # OPERATOR DECLARATION (row1 headers: Print Name | Position Title)
391
+ # ───────────────────────────────────────────────────────────────────────────
392
+ if schema_name == "Operator Declaration":
393
+ print(f" 🧾 EXTRACTION FIX: Processing Operator Declaration table")
394
+
395
+ labels = spec["labels"] # ["Print Name", "Position Title"]
396
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
397
+
398
+ collected = {lbl: [] for lbl in labels}
399
+
400
+ if len(tbl.rows) < 2:
401
+ print(f" ❌ Operator Declaration table has less than 2 rows")
402
+ return {}
403
+
404
+ # map header cells → labels (row1 orientation)
405
+ header_row = tbl.rows[0]
406
+ column_mapping = {}
407
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
408
+
409
+ for col_idx, cell in enumerate(header_row.cells):
410
+ raw_h = normalize_text(cell.text)
411
+ header_text = normalize_header_label(raw_h)
412
+ if not header_text:
413
+ continue
414
+ print(f" Column {col_idx}: '{raw_h}'")
415
+
416
+ # alias/canonical first
417
+ canon = canonicalize_label(header_text)
418
+ if canon in canonical_labels:
419
+ best_label = canonical_labels[canon]
420
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
421
+ column_mapping[col_idx] = best_label
422
+ continue
423
+
424
+ # else bag-of-words similarity
425
+ best_label, best_score = None, 0.0
426
+ for canon_lab, original_lab in canonical_labels.items():
427
+ s = bag_similarity(header_text, canon_lab)
428
+ if s > best_score:
429
+ best_score, best_label = s, original_lab
430
+
431
+ if best_label and best_score >= 0.40:
432
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
433
+ column_mapping[col_idx] = best_label
434
+ else:
435
+ print(f" ⚠️ No mapping found for '{raw_h}'")
436
+
437
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
438
+
439
+ # collect red text from the (usually single) data row
440
+ for row_idx in range(1, len(tbl.rows)):
441
+ row = tbl.rows[row_idx]
442
+ print(f" 📌 Processing data row {row_idx}")
443
+ for col_idx, cell in enumerate(row.cells):
444
+ if col_idx not in column_mapping:
445
+ continue
446
+ label = column_mapping[col_idx]
447
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
448
+ if not reds:
449
+ continue
450
+ reds = coalesce_numeric_runs(reds)
451
+ red_txt = normalize_text(" ".join(reds))
452
+ if not red_txt:
453
+ continue
454
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
455
+ collected[label].append(red_txt)
456
+
457
+ result = {k: v for k, v in collected.items() if v}
458
+ print(f" ✅ Operator Declaration extracted: {len(result)} columns with data")
459
+ return result
460
+
461
+ # ───────────────────────────────────────────────────────────────────────────
462
+ # A) Vehicle Registration tables (per-row accumulation; NO dedupe)
463
+ # ───────────────────────────────────────────────────────────────────────────
464
+ if "Vehicle Registration" in schema_name:
465
+ print(f" 🚗 EXTRACTION FIX: Processing Vehicle Registration table")
466
+
467
+ labels = spec["labels"]
468
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
469
+
470
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
471
+ unmapped_bucket = {}
472
+
473
+ if len(tbl.rows) < 2:
474
+ print(f" ❌ Vehicle table has less than 2 rows")
475
+ return {}
476
+
477
+ header_row = tbl.rows[0]
478
+ column_mapping = {}
479
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
480
+
481
+ for col_idx, cell in enumerate(header_row.cells):
482
+ raw_h = normalize_text(cell.text)
483
+ header_text = normalize_header_label(raw_h)
484
+ if not header_text:
485
+ continue
486
+ print(f" Column {col_idx}: '{raw_h}'")
487
+
488
+ # Try alias/canonical first
489
+ canon = canonicalize_label(header_text)
490
+ if canon in canonical_labels:
491
+ best_label = canonical_labels[canon]
492
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
493
+ column_mapping[col_idx] = best_label
494
+ continue
495
+
496
+ # Else bag-of-words similarity
497
+ best_label, best_score = None, 0.0
498
+ for canon_lab, original_lab in canonical_labels.items():
499
+ s = bag_similarity(header_text, canon_lab)
500
+ if s > best_score:
501
+ best_score, best_label = s, original_lab
502
+
503
+ if best_label and best_score >= 0.40:
504
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
505
+ column_mapping[col_idx] = best_label
506
+ else:
507
+ print(f" ⚠️ No mapping found for '{raw_h}'")
508
+ unmapped_bucket[raw_h] = []
509
+
510
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
511
+
512
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
513
+ for row_idx in range(1, len(tbl.rows)):
514
+ row = tbl.rows[row_idx]
515
+ print(f" 📌 Processing data row {row_idx}")
516
+ for col_idx, cell in enumerate(row.cells):
517
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
518
+ if not reds:
519
+ continue
520
+ reds = coalesce_numeric_runs(reds)
521
+ red_txt = normalize_text(" ".join(reds))
522
+ if not red_txt:
523
+ continue
524
+
525
+ if col_idx in column_mapping:
526
+ label = column_mapping[col_idx]
527
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
528
+ collected[label].append(red_txt) # ← append every occurrence
529
+ else:
530
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
531
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
532
+
533
+ result = {k: v for k, v in collected.items() if v}
534
+ if unmapped_bucket:
535
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
536
+ print(f" ✅ Vehicle Registration extracted: {len(result)} columns with data")
537
+ return result
538
+
539
+ # ───────────────────────────────────────────────────────────────────────────
540
+ # B) Driver / Scheduler Records Examined (per-row accumulation; NO dedupe)
541
+ # ───────────────────────────────────────────────────────────────────────────
542
+ if "Driver / Scheduler" in schema_name:
543
+ print(f" 👤 EXTRACTION FIX: Processing Driver / Scheduler table")
544
+
545
+ labels = spec["labels"]
546
+ canonical_labels = {canonicalize_label(lbl): lbl for lbl in labels}
547
+
548
+ collected = {lbl: [] for lbl in labels} # ← keep every row value
549
+ unmapped_bucket = {}
550
+
551
+ if len(tbl.rows) < 2:
552
+ print(f" ❌ Driver/Scheduler table has less than 2 rows")
553
+ return {}
554
+
555
+ header_row = tbl.rows[0]
556
+ column_mapping = {}
557
+ print(f" 📋 Mapping {len(header_row.cells)} header cells to labels")
558
+
559
+ for col_idx, cell in enumerate(header_row.cells):
560
+ raw_h = normalize_text(cell.text)
561
+ header_text = normalize_header_label(raw_h)
562
+ if not header_text:
563
+ continue
564
+ print(f" Column {col_idx}: '{raw_h}'")
565
+
566
+ # Try alias/canonical first (rarely used here, but safe)
567
+ canon = canonicalize_label(header_text)
568
+ if canon in canonical_labels:
569
+ best_label = canonical_labels[canon]
570
+ print(f" ✅ Mapped to: '{best_label}' (alias)")
571
+ column_mapping[col_idx] = best_label
572
+ continue
573
+
574
+ # Else bag-of-words similarity (good for long headings)
575
+ best_label, best_score = None, 0.0
576
+ for canon_lab, original_lab in canonical_labels.items():
577
+ s = bag_similarity(header_text, canon_lab)
578
+ if s > best_score:
579
+ best_score, best_label = s, original_lab
580
+
581
+ if best_label and best_score >= 0.40:
582
+ print(f" ✅ Mapped to: '{best_label}' (score: {best_score:.2f})")
583
+ column_mapping[col_idx] = best_label
584
+ else:
585
+ print(f" ⚠️ No mapping found for '{raw_h}'")
586
+ unmapped_bucket[raw_h] = []
587
+
588
+ print(f" 📊 Total column mappings: {len(column_mapping)}")
589
+
590
+ header_texts = [normalize_text(hc.text) for hc in header_row.cells]
591
+ for row_idx in range(1, len(tbl.rows)):
592
+ row = tbl.rows[row_idx]
593
+ print(f" 📌 Processing data row {row_idx}")
594
+ for col_idx, cell in enumerate(row.cells):
595
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
596
+ if not reds:
597
+ continue
598
+ reds = coalesce_numeric_runs(reds)
599
+ red_txt = normalize_text(" ".join(reds))
600
+ if not red_txt:
601
+ continue
602
+
603
+ if col_idx in column_mapping:
604
+ label = column_mapping[col_idx]
605
+ print(f" 🔴 Found red text in '{label}': '{red_txt}'")
606
+ collected[label].append(red_txt) # ← append every occurrence
607
+ else:
608
+ header_name = header_texts[col_idx] if col_idx < len(header_texts) else f"(unmapped col {col_idx})"
609
+ unmapped_bucket.setdefault(header_name, []).append(red_txt)
610
+
611
+ result = {k: v for k, v in collected.items() if v}
612
+ if unmapped_bucket:
613
+ result.update({f"UNMAPPED::{k}": v for k, v in unmapped_bucket.items() if v})
614
+ print(f" ✅ Driver / Scheduler extracted: {len(result)} columns with data")
615
+ return result
616
+
617
+ # ───────────────────────────────────────────────────────────────────────────
618
+ # C) Generic tables (unchanged: WITH dedupe)
619
+ # ───────────────────────────────────────────────────────────────────────────
620
+ labels = spec["labels"] + [schema_name]
621
+ collected = {lbl: [] for lbl in labels}
622
+ seen = {lbl: set() for lbl in labels}
623
+ by_col = (spec.get("orientation") == "row1")
624
+ start_row = 1 if by_col else 0
625
+ rows = tbl.rows[start_row:]
626
+
627
+ for ri, row in enumerate(rows):
628
+ for ci, cell in enumerate(row.cells):
629
+ reds = [run.text for p in cell.paragraphs for run in p.runs if is_red_font(run) and run.text]
630
+ if not reds:
631
+ continue
632
+ reds = coalesce_numeric_runs(reds)
633
+ red_txt = normalize_text(" ".join(reds))
634
+ if not red_txt:
635
+ continue
636
+
637
+ if by_col:
638
+ if ci < len(spec["labels"]):
639
+ lbl = spec["labels"][ci]
640
+ else:
641
+ lbl = schema_name
642
+ else:
643
+ raw_label = normalize_text(row.cells[0].text)
644
+ lbl = None
645
+ for spec_label in spec["labels"]:
646
+ if normalize_text(spec_label).upper() == raw_label.upper():
647
+ lbl = spec_label
648
+ break
649
+ if not lbl:
650
+ a_raw = normalize_header_label(raw_label).upper()
651
+ for spec_label in spec["labels"]:
652
+ a_spec = normalize_header_label(spec_label).upper()
653
+ if a_spec in a_raw or a_raw in a_spec:
654
+ lbl = spec_label
655
+ break
656
+ if not lbl:
657
+ lbl = schema_name
658
+
659
+ if red_txt not in seen[lbl]:
660
+ seen[lbl].add(red_txt)
661
+ collected[lbl].append(red_txt)
662
+
663
+ return {k: v for k, v in collected.items() if v}
664
+
665
+ def extract_red_text(input_doc):
666
+ # input_doc: docx.Document object or file path
667
+ if isinstance(input_doc, str):
668
+ doc = Document(input_doc)
669
+ else:
670
+ doc = input_doc
671
+ out = {}
672
+ table_count = 0
673
+ for tbl in doc.tables:
674
+ table_count += 1
675
+ multi_schemas = check_multi_schema_table(tbl)
676
+ if multi_schemas:
677
+ multi_data = extract_multi_schema_table(tbl, multi_schemas)
678
+ for schema_name, schema_data in multi_data.items():
679
+ if schema_data:
680
+ if schema_name in out:
681
+ for k, v in schema_data.items():
682
+ if k in out[schema_name]:
683
+ out[schema_name][k].extend(v)
684
+ else:
685
+ out[schema_name][k] = v
686
+ else:
687
+ out[schema_name] = schema_data
688
+ continue
689
+ schema = match_table_schema(tbl)
690
+ if not schema:
691
+ continue
692
+ spec = TABLE_SCHEMAS[schema]
693
+ data = extract_table_data(tbl, schema, spec)
694
+ if data:
695
+ if schema in out:
696
+ for k, v in data.items():
697
+ if k in out[schema]:
698
+ out[schema][k].extend(v)
699
+ else:
700
+ out[schema][k] = v
701
+ else:
702
+ out[schema] = data
703
+
704
+ # paragraphs (FIX: do not return early; build full 'paras' then attach)
705
+ paras = {}
706
+ for idx, para in enumerate(doc.paragraphs):
707
+ red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
708
+ if not red_txt:
709
+ continue
710
+ context = None
711
+ for j in range(idx-1, -1, -1):
712
+ txt = normalize_text(doc.paragraphs[j].text)
713
+ if txt:
714
+ all_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
715
+ if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
716
+ context = txt
717
+ break
718
+ if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
719
+ context = "Date"
720
+ if not context:
721
+ context = "(para)"
722
+ paras.setdefault(context, []).append(red_txt)
723
+
724
+ if paras:
725
+ out["paragraphs"] = paras
726
+
727
+ # Fallback: ensure we capture the last-page Operator Declaration by headers
728
+ if "Operator Declaration" not in out:
729
+ op_dec = extract_operator_declaration_by_headers_from_end(doc)
730
+ if op_dec:
731
+ out["Operator Declaration"] = op_dec
732
+
733
+ return out
734
+
735
+ def extract_red_text_filelike(input_file, output_file):
736
+ """
737
+ Accepts:
738
+ input_file: file-like object (BytesIO/File) or path
739
+ output_file: file-like object (opened for writing text) or path
740
+ """
741
+ if hasattr(input_file, "seek"):
742
+ input_file.seek(0)
743
+ doc = Document(input_file)
744
+ result = extract_red_text(doc)
745
+ if hasattr(output_file, "write"):
746
+ json.dump(result, output_file, indent=2, ensure_ascii=False)
747
+ output_file.flush()
748
+ else:
749
+ with open(output_file, "w", encoding="utf-8") as f:
750
+ json.dump(result, f, indent=2, ensure_ascii=False)
751
+ return result
752
+
753
+ if __name__ == "__main__":
754
+ # Support both script and app/file-like usage
755
+ if len(sys.argv) == 3:
756
+ input_docx = sys.argv[1]
757
+ output_json = sys.argv[2]
758
+ doc = Document(input_docx)
759
+ word_data = extract_red_text(doc)
760
+ with open(output_json, 'w', encoding='utf-8') as f:
761
+ json.dump(word_data, f, indent=2, ensure_ascii=False)
762
+ print(json.dumps(word_data, indent=2, ensure_ascii=False))
763
+ else:
764
+ print("To use as a module: extract_red_text_filelike(input_file, output_file)")
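
A short usage sketch for the two entry points defined above, assuming the file is importable as `extract_red_text` and that `audit.docx` is a placeholder document:

```python
import io
import json

from extract_red_text import extract_red_text, extract_red_text_filelike  # assumed import name

# 1) Path (or docx.Document) input, dict output
data = extract_red_text("audit.docx")  # hypothetical .docx containing red-font entries
print(json.dumps(data, indent=2, ensure_ascii=False))

# 2) File-like input/output, e.g. an in-memory upload
with open("audit.docx", "rb") as fh:
    buf = io.BytesIO(fh.read())
out = io.StringIO()
result = extract_red_text_filelike(buf, out)  # writes JSON to `out` and also returns the dict
```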
space-pdf/master_key.py ADDED
@@ -0,0 +1,372 @@
1
+ """
2
+ Improved Master Key for NHVAS Audit extraction:
3
+ - TABLE_SCHEMAS: Enhanced definitions with better matching criteria for Summary vs Basic tables
4
+ - HEADING_PATTERNS: Improved regex patterns for main/sub headings
5
+ - PARAGRAPH_PATTERNS: Enhanced patterns for key narrative sections
6
+ """
7
+
8
+ # 1. Enhanced table schemas with better matching logic
9
+ TABLE_SCHEMAS = {
10
+ "Tick as appropriate": {
11
+ "headings": [
12
+ {"level": 1, "text": "NHVAS Audit Summary Report"},
13
+ ],
14
+ "orientation": "left",
15
+ "labels": [
16
+ "Mass",
17
+ "Entry Audit",
18
+ "Maintenance",
19
+ "Initial Compliance Audit",
20
+ "Basic Fatigue",
21
+ "Compliance Audit",
22
+ "Advanced Fatigue",
23
+ "Spot Check",
24
+ "Triggered Audit"
25
+ ],
26
+ "priority": 90 # High priority for direct match
27
+ },
28
+ "Audit Information": {
29
+ "orientation": "left",
30
+ "labels": [
31
+ "Date of Audit",
32
+ "Location of audit",
33
+ "Auditor name",
34
+ "Audit Matrix Identifier (Name or Number)",
35
+ "Auditor Exemplar Global Reg No.",
36
+ "expiry Date:",
37
+ "NHVR Auditor Registration Number",
38
+ "expiry Date:"
39
+ ],
40
+ "priority": 80
41
+ },
42
+ "Operator Information": {
43
+ "headings": [
44
+ {"level": 1, "text": "Operator Information"}
45
+ ],
46
+ "orientation": "left",
47
+ "labels": [
48
+ "Operator name (Legal entity)",
49
+ "NHVAS Accreditation No. (If applicable)",
50
+ "Registered trading name/s",
51
+ "Australian Company Number",
52
+ "NHVAS Manual (Policies and Procedures) developed by"
53
+ ],
54
+ "priority": 85
55
+ },
56
+ "Operator contact details": {
57
+ "orientation": "left",
58
+ "labels": [
59
+ "Operator business address",
60
+ "Operator Postal address",
61
+ "Email address",
62
+ "Operator Telephone Number"
63
+ ],
64
+ "priority": 75,
65
+ "context_keywords": ["contact", "address", "email", "telephone"]
66
+ },
67
+ "Attendance List (Names and Position Titles)": {
68
+ "headings": [
69
+ {"level": 1, "text": "NHVAS Audit Summary Report"}
70
+ ],
71
+ "orientation": "row1",
72
+ "labels": ["Attendance List (Names and Position Titles)"],
73
+ "priority": 90
74
+ },
75
+ "Nature of the Operators Business (Summary)": {
76
+ "orientation": "row1",
77
+ "labels": ["Nature of the Operators Business (Summary):"],
78
+ "split_labels": ["Accreditation Number:", "Expiry Date:"],
79
+ "priority": 85
80
+ },
81
+ "Accreditation Vehicle Summary": {
82
+ "orientation": "left",
83
+ "labels": ["Number of powered vehicles", "Number of trailing vehicles"],
84
+ "priority": 80
85
+ },
86
+ "Accreditation Driver Summary": {
87
+ "orientation": "left",
88
+ "labels": ["Number of drivers in BFM", "Number of drivers in AFM"],
89
+ "priority": 80
90
+ },
91
+ "Compliance Codes": {
92
+ "orientation": "left",
93
+ "labels": ["V", "NC", "TNC", "SFI", "NAP", "NA"],
94
+ "priority": 70,
95
+ "context_exclusions": ["MASS MANAGEMENT", "MAINTENANCE MANAGEMENT", "FATIGUE MANAGEMENT"]
96
+ },
97
+ "Corrective Action Request Identification": {
98
+ "orientation": "row1",
99
+ "labels": ["Title", "Abbreviation", "Description"],
100
+ "priority": 80
101
+ },
102
+
103
+ # 🎯 BASIC MANAGEMENT SCHEMAS (Compliance Tables - Lower Priority)
104
+ "Maintenance Management": {
105
+ "headings": [
106
+ {"level": 1, "text": "NHVAS AUDIT SUMMARY REPORT"}
107
+ ],
108
+ "orientation": "left",
109
+ "labels": [
110
+ "Std 1. Daily Check",
111
+ "Std 2. Fault Recording and Reporting",
112
+ "Std 3. Fault Repair",
113
+ "Std 4. Maintenance Schedules and Methods",
114
+ "Std 5. Records and Documentation",
115
+ "Std 6. Responsibilities",
116
+ "Std 7. Internal Review",
117
+ "Std 8. Training and Education"
118
+ ],
119
+ "priority": 60,
120
+ "context_keywords": ["maintenance"],
121
+ "context_exclusions": ["summary", "details", "audit findings"] # Exclude Summary tables
122
+ },
123
+ "Mass Management": {
124
+ "headings": [
125
+ {"level": 1, "text": "NHVAS AUDIT SUMMARY REPORT"}
126
+ ],
127
+ "orientation": "left",
128
+ "labels": [
129
+ "Std 1. Responsibilities",
130
+ "Std 2. Vehicle Control",
131
+ "Std 3. Vehicle Use",
132
+ "Std 4. Records and Documentation",
133
+ "Std 5. Verification",
134
+ "Std 6. Internal Review",
135
+ "Std 7. Training and Education",
136
+ "Std 8. Maintenance of Suspension"
137
+ ],
138
+ "priority": 60,
139
+ "context_keywords": ["mass"],
140
+ "context_exclusions": ["summary", "details", "audit findings"] # Exclude Summary tables
141
+ },
142
+ "Fatigue Management": {
143
+ "headings": [
144
+ {"level": 1, "text": "NHVAS AUDIT SUMMARY REPORT"}
145
+ ],
146
+ "orientation": "left",
147
+ "labels": [
148
+ "Std 1. Scheduling and Rostering",
149
+ "Std 2. Health and wellbeing for performed duty",
150
+ "Std 3. Training and Education",
151
+ "Std 4. Responsibilities and management practices",
152
+ "Std 5. Internal Review",
153
+ "Std 6. Records and Documentation",
154
+ "Std 7. Workplace conditions"
155
+ ],
156
+ "priority": 60,
157
+ "context_keywords": ["fatigue"],
158
+ "context_exclusions": ["summary", "details", "audit findings"] # Exclude Summary tables
159
+ },
160
+
161
+ # 🎯 SUMMARY MANAGEMENT SCHEMAS (Detailed Tables with DETAILS column - Higher Priority)
162
+ "Maintenance Management Summary": {
163
+ "headings": [
164
+ {"level": 1, "text": "Audit Observations and Comments"},
165
+ {"level": 2, "text": "Maintenance Management Summary of Audit findings"}
166
+ ],
167
+ "orientation": "left",
168
+ "columns": ["MAINTENANCE MANAGEMENT", "DETAILS"],
169
+ "labels": [
170
+ "Std 1. Daily Check",
171
+ "Std 2. Fault Recording and Reporting",
172
+ "Std 3. Fault Repair",
173
+ "Std 4. Maintenance Schedules and Methods",
174
+ "Std 5. Records and Documentation",
175
+ "Std 6. Responsibilities",
176
+ "Std 7. Internal Review",
177
+ "Std 8. Training and Education"
178
+ ],
179
+ "priority": 85, # Higher priority than basic Maintenance Management
180
+ "context_keywords": ["maintenance", "summary", "details", "audit findings"]
181
+ },
182
+ "Mass Management Summary": {
183
+ "headings": [
184
+ {"level": 1, "text": "Mass Management Summary of Audit findings"}
185
+ ],
186
+ "orientation": "left",
187
+ "columns": ["MASS MANAGEMENT", "DETAILS"],
188
+ "labels": [
189
+ "Std 1. Responsibilities",
190
+ "Std 2. Vehicle Control",
191
+ "Std 3. Vehicle Use",
192
+ "Std 4. Records and Documentation",
193
+ "Std 5. Verification",
194
+ "Std 6. Internal Review",
195
+ "Std 7. Training and Education",
196
+ "Std 8. Maintenance of Suspension"
197
+ ],
198
+ "priority": 85, # Higher priority than basic Mass Management
199
+ "context_keywords": ["mass", "summary", "details", "audit findings"]
200
+ },
201
+ "Fatigue Management Summary": {
202
+ "headings": [
203
+ {"level": 1, "text": "Fatigue Management Summary of Audit findings"}
204
+ ],
205
+ "orientation": "left",
206
+ "columns": ["FATIGUE MANAGEMENT", "DETAILS"],
207
+ "labels": [
208
+ "Std 1. Scheduling and Rostering",
209
+ "Std 2. Health and wellbeing for performed duty",
210
+ "Std 3. Training and Education",
211
+ "Std 4. Responsibilities and management practices",
212
+ "Std 5. Internal Review",
213
+ "Std 6. Records and Documentation",
214
+ "Std 7. Workplace conditions"
215
+ ],
216
+ "priority": 85, # Higher priority than basic Fatigue Management
217
+ "context_keywords": ["fatigue", "summary", "details", "audit findings"]
218
+ },
219
+
220
+ # Vehicle Registration Tables
221
+ "Vehicle Registration Numbers Mass": {
222
+ "headings": [
223
+ {"level": 1, "text": "Vehicle Registration Numbers of Records Examined"},
224
+ {"level": 2, "text": "MASS MANAGEMENT"}
225
+ ],
226
+ "orientation": "row1",
227
+ "labels": [
228
+ "No.", "Registration Number", "Sub contractor",
229
+ "Sub-contracted Vehicles Statement of Compliance",
230
+ "Weight Verification Records",
231
+ "RFS Suspension Certification #",
232
+ "Suspension System Maintenance", "Trip Records",
233
+ "Fault Recording/ Reporting on Suspension System"
234
+ ],
235
+ "priority": 90, # Higher priority
236
+ "context_keywords": ["mass", "vehicle registration", "rfs suspension", "weight verification"],
237
+ "context_exclusions": ["maintenance", "roadworthiness", "daily checks"] # Exclude maintenance-specific terms
238
+ },
239
+ "Vehicle Registration Numbers Maintenance": {
240
+ "headings": [
241
+ {"level": 1, "text": "Vehicle Registration Numbers of Records Examined"},
242
+ {"level": 2, "text": "Maintenance Management"}
243
+ ],
244
+ "orientation": "row1",
245
+ "labels": [
246
+ "No.", "Registration Number", "Roadworthiness Certificates",
247
+ "Maintenance Records", "Daily Checks",
248
+ "Fault Recording/ Reporting", "Fault Repair"
249
+ ],
250
+ "priority": 85, # Lower priority
251
+ "context_keywords": ["maintenance", "vehicle registration", "roadworthiness", "daily checks"],
252
+ "context_exclusions": ["mass", "rfs suspension", "weight verification"] # Exclude mass-specific terms
253
+ },
254
+ "Driver / Scheduler Records Examined": {
255
+ "headings": [
256
+ {"level": 1, "text": "Driver / Scheduler Records Examined"},
257
+ {"level": 2, "text": "FATIGUE MANAGEMENT"},
258
+ ],
259
+ "orientation": "row1",
260
+ "labels": [
261
+ "No.",
262
+ "Driver / Scheduler Name",
263
+ "Driver TLIF Course # Completed",
264
+ "Scheduler TLIF Course # Completed",
265
+ "Medical Certificates (Current Yes/No) Date of expiry",
266
+ "Roster / Schedule / Safe Driving Plan (Date Range)",
267
+ "Fit for Duty Statement Completed (Yes/No)",
268
+ "Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"
269
+ ],
270
+ "priority": 80,
271
+ "context_keywords": ["driver", "scheduler", "fatigue"]
272
+ },
273
+
274
+ # Other Tables
275
+ "Operator's Name (legal entity)": {
276
+ "headings": [
277
+ {"level": 1, "text": "CORRECTIVE ACTION REQUEST (CAR)"}
278
+ ],
279
+ "orientation": "left",
280
+ "labels": ["Operator's Name (legal entity)"],
281
+ "priority": 85
282
+ },
283
+ "Non-conformance and CAR details": {
284
+ "orientation": "left",
285
+ "labels": [
286
+ "Non-conformance agreed close out date",
287
+ "Module and Standard",
288
+ "Corrective Action Request (CAR) Number",
289
+ "Observed Non-conformance:",
290
+ "Corrective Action taken or to be taken by operator:",
291
+ "Operator or Representative Signature",
292
+ "Position",
293
+ "Date",
294
+ "Comments:",
295
+ "Auditor signature",
296
+ "Date"
297
+ ],
298
+ "priority": 75,
299
+ "context_keywords": ["non-conformance", "corrective action"]
300
+ },
301
+ "NHVAS Approved Auditor Declaration": {
302
+ "headings": [
303
+ {"level": 1, "text": "NHVAS APPROVED AUDITOR DECLARATION"}
304
+ ],
305
+ "orientation": "row1",
306
+ "labels": ["Print Name", "NHVR or Exemplar Global Auditor Registration Number"],
307
+ "priority": 90,
308
+ "context_keywords": ["auditor declaration", "NHVR"],
309
+ "context_exclusions": ["manager", "operator declaration"]
310
+ },
311
+ "Audit Declaration dates": {
312
+ "headings": [
313
+ {"level": 1, "text": "Audit Declaration dates"}
314
+ ],
315
+ "orientation": "left",
316
+ "labels": [
317
+ "Audit was conducted on",
318
+ "Unconditional CARs closed out on:",
319
+ "Conditional CARs to be closed out by:"
320
+ ],
321
+ "priority": 80
322
+ },
323
+ "Print accreditation name": {
324
+ "headings": [
325
+ {"level": 1, "text": "(print accreditation name)"}
326
+ ],
327
+ "orientation": "left",
328
+ "labels": ["(print accreditation name)"],
329
+ "priority": 85
330
+ },
331
+ "Operator Declaration": {
332
+ "headings": [
333
+ {"level": 1, "text": "Operator Declaration"}
334
+ ],
335
+ "orientation": "row1",
336
+ "labels": ["Print Name", "Position Title"],
337
+ "priority": 90,
338
+ "context_keywords": ["operator declaration", "manager"],
339
+ "context_exclusions": ["auditor", "nhvas approved"]
340
+ }
341
+ }
342
+
343
+ # 2. Enhanced heading detection patterns
344
+ HEADING_PATTERNS = {
345
+ "main": [
346
+ r"NHVAS\s+Audit\s+Summary\s+Report",
347
+ r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
348
+ r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT"
349
+ ],
350
+ "sub": [
351
+ r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
352
+ r"MAINTENANCE\s+MANAGEMENT",
353
+ r"MASS\s+MANAGEMENT",
354
+ r"FATIGUE\s+MANAGEMENT",
355
+ r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
356
+ r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
357
+ r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
358
+ r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
359
+ r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
360
+ r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
361
+ r"Operator\s+Declaration",
362
+ r"Operator\s+Information"
363
+ ]
364
+ }
365
+
366
+ # 3. Enhanced paragraph patterns for key narrative sections
367
+ PARAGRAPH_PATTERNS = {
368
+ "findings_summary": r"Provide a summary of findings based on the evidence gathered during the audit\.",
369
+ "declaration_text": r"I hereby acknowledge and agree with the findings.*",
370
+ "introductory_note": r"This audit assesses the.*",
371
+ "date_line": r"^\s*\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4}\s*$|^Date$"
372
+ }
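
As a quick illustration of how these structures are consumed by `extract_red_text.py` (illustrative only; the real matching logic lives there):

```python
import re

from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS

# Schemas carry an explicit priority; higher values win ties during matching.
for name, spec in sorted(TABLE_SCHEMAS.items(), key=lambda kv: kv[1].get("priority", 0), reverse=True)[:3]:
    print(name, spec.get("orientation"), len(spec.get("labels", [])), "labels")

# Heading patterns are plain regex strings, matched case-insensitively by callers.
all_headings = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
print(any(re.search(p, "NHVAS Audit Summary Report", re.IGNORECASE) for p in all_headings))

# date_line accepts either an ordinal date line ("9th March 2023") or the literal "Date".
print(bool(re.fullmatch(PARAGRAPH_PATTERNS["date_line"], "9th March 2023")))
```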
space-pdf/packages.txt ADDED
@@ -0,0 +1,2 @@
1
+ poppler-utils
2
+ tesseract-ocr
space-pdf/requirements.txt ADDED
@@ -0,0 +1,37 @@
1
+ fastapi==0.111.1
2
+ pydantic==2.11.0
3
+ python-multipart==0.0.9
4
+ uvicorn==0.30.3
5
+ gunicorn==22.0.0
6
+ requests==2.32.3
7
+ torch==2.4.0
8
+ torchvision==0.19.0
9
+ Pillow==10.4.0
10
+ pdf-annotate==0.12.0
11
+ scipy==1.14.0
12
+ opencv-python==4.10.0.84
13
+ Shapely==2.0.5
14
+ transformers==4.40.2
15
+ huggingface_hub==0.23.5
16
+ pdf2image==1.17.0
17
+ lightgbm==4.5.0
18
+ setuptools==75.4.0
19
+ roman==4.2
20
+ hydra-core==1.3.2
21
+ pypandoc==1.13
22
+ rapid-table==2.0.3
23
+ rapidocr==3.2.0
24
+ pix2tex==0.1.4
25
+ latex2mathml==3.78.0
26
+ PyMuPDF==1.25.5
27
+ git+https://github.com/huridocs/pdf-features.git@2025.7.30.1
28
+ gradio==4.44.1
29
+ pytesseract
30
+ python-docx
31
+ camelot-py[cv] # for digital-table parsing
32
+ pdf2image # for fallback OCR on images
33
+ pytesseract
34
+ Pillow
35
+ rapidfuzz
36
+ pdfplumber
37
+ openai
space-pdf/update_docx_with_pdf.py ADDED
@@ -0,0 +1,1470 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced NHVAS PDF to DOCX JSON Merger
4
+ Comprehensive extraction and mapping from PDF to DOCX structure
5
+ (keep pipeline intact; fix spacing, operator info mapping, vehicle-reg header mapping, date fallback)
6
+ """
7
+ import json
8
+ import re
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Optional
12
+ from collections import OrderedDict
13
+
14
+
15
+ def _nz(x):
16
+ return x if isinstance(x, str) and x.strip() else ""
17
+
18
+ SUMMARY_SECTIONS = {
19
+ "MAINTENANCE MANAGEMENT": "Maintenance Management Summary",
20
+ "MASS MANAGEMENT": "Mass Management Summary",
21
+ "FATIGUE MANAGEMENT": "Fatigue Management Summary",
22
+ }
23
+
24
+ # ───────────────────────────── helpers: text cleanup & label matching ─────────────────────────────
25
+ def _canon_header(s: str) -> str:
26
+ if not s: return ""
27
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
28
+ s = s.replace("–", "-").replace("—", "-")
29
+ s = re.sub(r"[/]+", " / ", s)
30
+ s = re.sub(r"[^a-z0-9#/ ]+", " ", s)
31
+ s = re.sub(r"\s+", " ", s).strip()
32
+ return s
33
+
34
+
35
+ # Header aliases -> internal keys we already use later during mapping
36
+ _VEH_HEADER_ALIASES = {
37
+ # common
38
+ "registration number": "registration",
39
+ "reg no": "registration",
40
+ "reg.#": "registration",
41
+ "no.": "no",
42
+ "no": "no",
43
+
44
+ # maintenance table
45
+ "roadworthiness certificates": "roadworthiness",
46
+ "maintenance records": "maintenance_records",
47
+ "daily checks": "daily_checks",
48
+ "fault recording reporting": "fault_recording",
49
+ "fault recording / reporting": "fault_recording",
50
+ "fault repair": "fault_repair",
51
+
52
+ # mass table
53
+ "sub contractor": "sub_contractor",
54
+ "sub-contractor": "sub_contractor",
55
+ "sub contracted vehicles statement of compliance": "sub_comp",
56
+ "sub-contracted vehicles statement of compliance": "sub_comp",
57
+ "weight verification records": "weight_verification",
58
+ "rfs suspension certification #": "rfs_certification",
59
+ "rfs suspension certification number": "rfs_certification",
60
+ "suspension system maintenance": "suspension_maintenance",
61
+ "trip records": "trip_records",
62
+ "fault recording reporting on suspension system": "fault_reporting_suspension",
63
+ "fault recording / reporting on suspension system": "fault_reporting_suspension",
64
+ }
65
+
66
+ # --- helpers ---
67
+ def build_vehicle_sections(extracted: dict) -> dict:
68
+ """Build arrays for Maintenance and Mass tables. Maintenance uses recorded rows to include ALL entries."""
69
+ maint = {
70
+ "Registration Number": [],
71
+ "Roadworthiness Certificates": [],
72
+ "Maintenance Records": [],
73
+ "Daily Checks": [],
74
+ "Fault Recording/ Reporting": [],
75
+ "Fault Repair": [],
76
+ }
77
+ mass = {
78
+ "Registration Number": [],
79
+ "Weight Verification Records": [],
80
+ "RFS Suspension Certification #": [],
81
+ "Suspension System Maintenance": [],
82
+ "Trip Records": [],
83
+ "Fault Recording/ Reporting on Suspension System": [],
84
+ }
85
+
86
+ # Prefer authoritative maintenance rows captured during parsing (spans all pages)
87
+ if extracted.get("_maint_rows"):
88
+ for row in extracted["_maint_rows"]:
89
+ maint["Registration Number"].append(_smart_space(row.get("registration", "")))
90
+ maint["Roadworthiness Certificates"].append(_nz(row.get("roadworthiness", "")))
91
+ maint["Maintenance Records"].append(_nz(row.get("maintenance_records", "")))
92
+ maint["Daily Checks"].append(_nz(row.get("daily_checks", "")))
93
+ maint["Fault Recording/ Reporting"].append(_nz(row.get("fault_recording", "")))
94
+ maint["Fault Repair"].append(_nz(row.get("fault_repair", "")))
95
+ else:
96
+ # Fallback to vehicles map (older behavior)
97
+ for v in extracted.get("vehicles", []) or []:
98
+ if not v.get("registration"): continue
99
+ if v.get("seen_in_maintenance") or any(v.get(k) for k in ["roadworthiness","maintenance_records","daily_checks","fault_recording","fault_repair"]):
100
+ rw = _nz(v.get("roadworthiness", "")); mr = _nz(v.get("maintenance_records", "")); dc = _nz(v.get("daily_checks", ""))
101
+ fr = _nz(v.get("fault_recording", "")); rp = _nz(v.get("fault_repair", ""))
102
+ if not mr and dc: mr = dc
103
+ if not rp and fr: rp = fr
104
+ if not fr and rp: fr = rp
105
+ maint["Registration Number"].append(_smart_space(v["registration"]))
106
+ maint["Roadworthiness Certificates"].append(rw)
107
+ maint["Maintenance Records"].append(mr)
108
+ maint["Daily Checks"].append(dc)
109
+ maint["Fault Recording/ Reporting"].append(fr)
110
+ maint["Fault Repair"].append(rp)
111
+
112
+ # Mass stays as-is (from vehicles)
113
+ for v in extracted.get("vehicles", []) or []:
114
+ if not v.get("registration"): continue
115
+ if v.get("seen_in_mass") or any(v.get(k) for k in ["weight_verification","rfs_certification","suspension_maintenance","trip_records","fault_reporting_suspension"]):
116
+ mass["Registration Number"].append(_smart_space(v["registration"]))
117
+ mass["Weight Verification Records"].append(_nz(v.get("weight_verification", "")))
118
+ mass["RFS Suspension Certification #"].append(_nz(v.get("rfs_certification", "")))
119
+ mass["Suspension System Maintenance"].append(_nz(v.get("suspension_maintenance", "")))
120
+ mass["Trip Records"].append(_nz(v.get("trip_records", "")))
121
+ mass["Fault Recording/ Reporting on Suspension System"].append(_nz(v.get("fault_reporting_suspension", "")))
122
+
123
+ return {
124
+ "Vehicle Registration Numbers Maintenance": maint,
125
+ "Vehicle Registration Numbers Mass": mass,
126
+ }
127
+
128
+
129
+ def _map_header_indices(headers: list[str]) -> dict:
130
+ """Return {internal_key: column_index} by matching/aliasing header text."""
131
+ idx = {}
132
+ for i, h in enumerate(headers or []):
133
+ ch = _canon_header(h)
134
+ # try direct alias
135
+ if ch in _VEH_HEADER_ALIASES:
136
+ idx[_VEH_HEADER_ALIASES[ch]] = i
137
+ continue
138
+ # relax a little for 'registration number' variants
139
+ if "registration" in ch and "number" in ch:
140
+ idx["registration"] = i
141
+ continue
142
+ if "roadworthiness" in ch:
143
+ idx["roadworthiness"] = i
144
+ continue
145
+ if "maintenance" in ch and "records" in ch:
146
+ idx["maintenance_records"] = i
147
+ continue
148
+ if "daily" in ch and "check" in ch:
149
+ idx["daily_checks"] = i
150
+ continue
151
+ if "fault" in ch and "record" in ch and "suspension" not in ch:
152
+ # maintenance fault-recording column
153
+ if "repair" in ch:
154
+ idx["fault_repair"] = i
155
+ else:
156
+ idx["fault_recording"] = i
157
+ continue
158
+ if "weight" in ch and "verification" in ch:
159
+ idx["weight_verification"] = i
160
+ continue
161
+ if "rfs" in ch and "certification" in ch:
162
+ idx["rfs_certification"] = i
163
+ continue
164
+ if "suspension" in ch and "maintenance" in ch:
165
+ idx["suspension_maintenance"] = i
166
+ continue
167
+ if "trip" in ch and "record" in ch:
168
+ idx["trip_records"] = i
169
+ continue
170
+ if "fault" in ch and "report" in ch and "suspension" in ch:
171
+ idx["fault_reporting_suspension"] = i
172
+ continue
173
+ return idx
174
+
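+ # Illustrative sketch (not part of the original commit): how a typical header row resolves to
+ # internal keys. The header strings are assumptions, and the expected result presumes
+ # _canon_header lower-cases and strips punctuation as used above.
+ def _demo_map_header_indices() -> dict:
+     headers = ["No.", "Registration Number", "Roadworthiness Certificates", "Daily Checks"]
+     # roughly -> {"registration": 1, "roadworthiness": 2, "daily_checks": 3}
+     return _map_header_indices(headers)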
175
+ def _canon(s: str) -> str:
176
+ if not s: return ""
177
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
178
+ s = re.sub(r"[^a-z0-9#]+", " ", s)
179
+ return re.sub(r"\s+", " ", s).strip()
180
+
181
+ def _smart_space(s: str) -> str:
182
+ if not s: return s
183
+ s = str(s)
184
+
185
+ # Insert spaces at typical OCR glue points
186
+ s = re.sub(r'([a-z])([A-Z])', r'\1 \2', s)
187
+ s = re.sub(r'([A-Za-z])(\d)', r'\1 \2', s)
188
+ s = re.sub(r'(\d)([A-Za-z])', r'\1 \2', s)
189
+ s = re.sub(r'([A-Z]{2,})(\d)', r'\1 \2', s)
190
+
191
+ # Fix common glued tokens
192
+ s = s.replace("POBox", "PO Box")
193
+
194
+ # Compact ordinals back together: "9 th" -> "9th", but preserve a space after the ordinal if followed by a word
195
+ s = re.sub(r'\b(\d+)\s*(st|nd|rd|th)\b', r'\1\2', s)
196
+
197
+ s = re.sub(r"\s+", " ", s).strip()
198
+ return s
199
+
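+ # Illustrative sketch: the kind of OCR "glue" _smart_space undoes (sample strings are made up).
+ def _demo_smart_space() -> list:
+     samples = ["POBox123Somewhere", "20thFebruary2023", "ABC123"]
+     # e.g. "20thFebruary2023" -> "20th February 2023", "POBox123Somewhere" -> "PO Box 123 Somewhere"
+     return [_smart_space(s) for s in samples]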
200
+ def looks_like_plate(s: str) -> bool:
201
+ if not s: return False
202
+ t = re.sub(r"[\s-]", "", str(s).upper())
203
+ if not (5 <= len(t) <= 8): return False
204
+ if not re.fullmatch(r"[A-Z0-9]+", t): return False
205
+ if sum(c.isalpha() for c in t) < 2: return False
206
+ if sum(c.isdigit() for c in t) < 2: return False
207
+ if t in {"ENTRY","YES","NO","N/A","NA"}: return False
208
+ return True
209
+
210
+ def is_dateish(s: str) -> bool:
211
+ if not s: return False
212
+ s = _smart_space(s)
213
+ # tokens like 03/22, 20/02/2023, 01.02.21, 2023-02-20
214
+ return bool(re.search(r"\b\d{1,4}[./-]\d{1,2}(?:[./-]\d{1,4})?\b", s))
215
+
216
+ def extract_date_tokens(s: str) -> list[str]:
217
+ if not s: return []
218
+ s = _smart_space(s)
219
+ return re.findall(r"\b\d{1,4}[./-]\d{1,2}(?:[./-]\d{1,4})?\b", s)  # final part may be a 4-digit year (e.g. 20/02/2023)
220
+
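+ # Illustrative sketch of the row-level heuristics together (sample values are assumptions):
+ def _demo_row_heuristics() -> tuple:
+     plate_ok = looks_like_plate("XT02GH")                   # True: 5-8 alphanumerics, >=2 letters, >=2 digits
+     has_date = is_dateish("Serviced 20/02/2023")            # True
+     tokens   = extract_date_tokens("2023-02-20 and 03/22")  # ["2023-02-20", "03/22"]
+     return plate_ok, has_date, tokens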
221
+
222
+ def _clean_list(vals: List[str]) -> List[str]:
223
+ out = []
224
+ for v in vals:
225
+ v = _smart_space(v)
226
+ if v:
227
+ out.append(v)
228
+ return out
229
+
230
+ def _looks_like_manual_value(s: str) -> bool:
231
+ if not s: return False
232
+ s = s.strip()
233
+ # reject long pure-digit strings (e.g. "51902"); values must contain letters to be accepted below
234
+ if re.fullmatch(r"\d{3,}", s):
235
+ return False
236
+ # accept if it has any letters or typical version hints
237
+ return bool(re.search(r"[A-Za-z]", s))
238
+
239
+ def _looks_like_company(s: str) -> bool:
240
+ """Very light validation to avoid capturing labels as values."""
241
+ if not s: return False
242
+ s = _smart_space(s)
243
+ # at least two words containing letters (e.g., "Kangaroo Transport")
244
+ return bool(re.search(r"[A-Za-z]{2,}\s+[A-Za-z&]{2,}", s))
245
+
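+ # Illustrative sketch of the light-weight value filters (sample strings are assumptions):
+ def _demo_value_filters() -> dict:
+     return {
+         "manual_numeric": _looks_like_manual_value("51902"),                  # False: bare number
+         "manual_text":    _looks_like_manual_value("Prepared by Consultant"), # True: contains letters
+         "company_label":  _looks_like_company("ACN"),                         # False: single word
+         "company_name":   _looks_like_company("Kangaroo Transport Pty Ltd"),  # True: multi-word name
+     }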
246
+ # ───────────────────────────── label index (non-summary only; no values) ─────────────────────────────
247
+ LABEL_INDEX: Dict[str, Dict[str, Dict[str, Any]]] = {
248
+ "Audit Information": {
249
+ "Date of Audit": {"alts": ["Date of Audit"]},
250
+ "Location of audit": {"alts": ["Location of audit", "Location"]},
251
+ "Auditor name": {"alts": ["Auditor name", "Auditor"]},
252
+ "Audit Matrix Identifier (Name or Number)": {"alts": ["Audit Matrix Identifier (Name or Number)", "Audit Matrix Identifier"]},
253
+ "Auditor Exemplar Global Reg No.": {"alts": ["Auditor Exemplar Global Reg No."]},
254
+ "NHVR Auditor Registration Number": {"alts": ["NHVR Auditor Registration Number"]},
255
+ "expiry Date:": {"alts": ["expiry Date:", "Expiry Date:"]},
256
+ },
257
+ "Operator Information": {
258
+ "Operator name (Legal entity)": {"alts": ["Operator name (Legal entity)", "Operator's Name (legal entity)"]},
259
+ "NHVAS Accreditation No. (If applicable)": {"alts": ["NHVAS Accreditation No. (If applicable)", "NHVAS Accreditation No."]},
260
+ "Registered trading name/s": {"alts": ["Registered trading name/s", "Trading name/s"]},
261
+ "Australian Company Number": {"alts": ["Australian Company Number", "ACN"]},
262
+ "NHVAS Manual (Policies and Procedures) developed by": {"alts": [
263
+ "NHVAS Manual (Policies and Procedures) developed by",
264
+ "NHVAS Manual developed by",
265
+ "Manual developed by"
266
+ ]},
267
+ },
268
+ "Operator contact details": {
269
+ "Operator business address": {"alts": ["Operator business address", "Business address"]},
270
+ "Operator Postal address": {"alts": ["Operator Postal address", "Postal address"]},
271
+ "Email address": {"alts": ["Email address", "Email"]},
272
+ "Operator Telephone Number": {"alts": ["Operator Telephone Number", "Telephone", "Phone"]},
273
+ },
274
+ "Attendance List (Names and Position Titles)": {
275
+ "Attendance List (Names and Position Titles)": {"alts": ["Attendance List (Names and Position Titles)", "Attendance List"]},
276
+ },
277
+ "Nature of the Operators Business (Summary)": {
278
+ "Nature of the Operators Business (Summary):": {"alts": ["Nature of the Operators Business (Summary):"]},
279
+ },
280
+ "Accreditation Vehicle Summary": {
281
+ "Number of powered vehicles": {"alts": ["Number of powered vehicles"]},
282
+ "Number of trailing vehicles": {"alts": ["Number of trailing vehicles"]},
283
+ },
284
+ "Accreditation Driver Summary": {
285
+ "Number of drivers in BFM": {"alts": ["Number of drivers in BFM"]},
286
+ "Number of drivers in AFM": {"alts": ["Number of drivers in AFM"]},
287
+ },
288
+ "Vehicle Registration Numbers Maintenance": {
289
+ "No.": {"alts": ["No.", "No"]},
290
+ "Registration Number": {"alts": ["Registration Number", "Registration"]},
291
+ "Roadworthiness Certificates": {"alts": ["Roadworthiness Certificates", "Roadworthiness"]},
292
+ "Maintenance Records": {"alts": ["Maintenance Records"]},
293
+ "Daily Checks": {"alts": ["Daily Checks", "Daily Check"]},
294
+ "Fault Recording/ Reporting": {"alts": ["Fault Recording/ Reporting", "Fault Recording / Reporting"]},
295
+ "Fault Repair": {"alts": ["Fault Repair"]},
296
+ },
297
+ "Vehicle Registration Numbers Mass": {
298
+ "No.": {"alts": ["No.", "No"]},
299
+ "Registration Number": {"alts": ["Registration Number", "Registration"]},
300
+ "Sub contractor": {"alts": ["Sub contractor", "Sub-contractor"]},
301
+ "Sub-contracted Vehicles Statement of Compliance": {"alts": ["Sub-contracted Vehicles Statement of Compliance"]},
302
+ "Weight Verification Records": {"alts": ["Weight Verification Records"]},
303
+ "RFS Suspension Certification #": {"alts": ["RFS Suspension Certification #", "RFS Suspension Certification Number"]},
304
+ "Suspension System Maintenance": {"alts": ["Suspension System Maintenance"]},
305
+ "Trip Records": {"alts": ["Trip Records"]},
306
+ "Fault Recording/ Reporting on Suspension System": {"alts": ["Fault Recording/ Reporting on Suspension System"]},
307
+ },
308
+ "Driver / Scheduler Records Examined": {
309
+ "No.": {"alts": ["No.", "No"]},
310
+ "Driver / Scheduler Name": {"alts": ["Driver / Scheduler Name"]},
311
+ "Driver TLIF Course # Completed": {"alts": ["Driver TLIF Course # Completed"]},
312
+ "Scheduler TLIF Course # Completed": {"alts": ["Scheduler TLIF Course # Completed"]},
313
+ "Medical Certificates (Current Yes/No) Date of expiry": {"alts": ["Medical Certificates (Current Yes/No) Date of expiry"]},
314
+ "Roster / Schedule / Safe Driving Plan (Date Range)": {"alts": ["Roster / Schedule / Safe Driving Plan (Date Range)"]},
315
+ "Fit for Duty Statement Completed (Yes/No)": {"alts": ["Fit for Duty Statement Completed (Yes/No)"]},
316
+ "Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)": {"alts": ["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"]},
317
+ },
318
+ "NHVAS Approved Auditor Declaration": {
319
+ "Print Name": {"alts": ["Print Name"]},
320
+ "NHVR or Exemplar Global Auditor Registration Number": {"alts": ["NHVR or Exemplar Global Auditor Registration Number"]},
321
+ },
322
+ "Audit Declaration dates": {
323
+ "Audit was conducted on": {"alts": ["Audit was conducted on"]},
324
+ "Unconditional CARs closed out on:": {"alts": ["Unconditional CARs closed out on:"]},
325
+ "Conditional CARs to be closed out by:": {"alts": ["Conditional CARs to be closed out by:"]},
326
+ },
327
+ "Print accreditation name": {
328
+ "(print accreditation name)": {"alts": ["(print accreditation name)"]},
329
+ },
330
+ "Operator Declaration": {
331
+ "Print Name": {"alts": ["Print Name"]},
332
+ "Position Title": {"alts": ["Position Title"]},
333
+ },
334
+ }
335
+
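+ # Illustrative sketch (not part of the original commit): LABEL_INDEX maps
+ # section -> label -> {"alts": [...]}; a flat reverse lookup can be derived like this.
+ def _demo_label_lookup() -> dict:
+     lookup = {}
+     for section, labels in LABEL_INDEX.items():
+         for label, meta in labels.items():
+             for alt in meta.get("alts", [label]):
+                 lookup.setdefault(_canon(alt), (section, label))
+     return lookup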
336
+ class NHVASMerger:
337
+ def __init__(self):
338
+ self.debug_mode = True
339
+ self._vehicle_by_reg = OrderedDict()
340
+
341
+ def log_debug(self, msg: str):
342
+ if self.debug_mode:
343
+ print(f"🔍 {msg}")
344
+
345
+ def normalize_std_label(self, label: str) -> str:
346
+ if not label: return ""
347
+ base = re.sub(r"\([^)]*\)", "", label)
348
+ base = re.sub(r"\s+", " ", base).strip()
349
+ m = re.match(r"^(Std\s*\d+\.\s*[^:]+?)\s*$", base, flags=re.IGNORECASE)
350
+ return m.group(1).strip() if m else base
351
+
352
+ def _pick_nearby(self, row, anchor_idx: int | None, want: str = "plate", window: int = 3) -> str:
353
+ """Return the best cell for a field by looking at the anchor index and nearby columns.
354
+ want ∈ {"plate","date","rf","yn"}"""
355
+ def cell(i):
356
+ if i is None or i < 0 or i >= len(row): return ""
357
+ v = row[i]
358
+ return v.strip() if isinstance(v, str) else str(v).strip()
359
+
360
+ # 1) try the anchor cell
361
+ cand = cell(anchor_idx)
362
+ if want == "plate" and looks_like_plate(cand): return _smart_space(cand)
363
+ if want == "date" and is_dateish(cand): return _smart_space(cand)
364
+ if want == "rf" and re.search(r"\bRF\s*\d+\b", cand, re.I): return _smart_space(re.search(r"\bRF\s*\d+\b", cand, re.I).group(0))
365
+ if want == "yn" and cand.strip().lower() in {"yes","no"}: return cand.strip().title()
366
+
367
+ # 2) scan a window around the anchor
368
+ if anchor_idx is not None:
369
+ for offset in range(1, window+1):
370
+ for i in (anchor_idx - offset, anchor_idx + offset):
371
+ c = cell(i)
372
+ if not c: continue
373
+ if want == "plate" and looks_like_plate(c): return _smart_space(c)
374
+ if want == "date" and is_dateish(c): return _smart_space(c)
375
+ if want == "rf":
376
+ m = re.search(r"\bRF\s*\d+\b", c, re.I)
377
+ if m: return _smart_space(m.group(0))
378
+ if want == "yn" and c.strip().lower() in {"yes","no"}: return c.strip().title()
379
+
380
+ # 3) last resort: scan whole row
381
+ joined = " ".join(str(c or "") for c in row)
382
+ if want == "plate":
383
+ for tok in joined.split():
384
+ if looks_like_plate(tok): return _smart_space(tok)
385
+ if want == "date":
386
+ tok = extract_date_tokens(joined)
387
+ return tok[0] if tok else ""
388
+ if want == "rf":
389
+ m = re.search(r"\bRF\s*\d+\b", joined, re.I)
390
+ return _smart_space(m.group(0)) if m else ""
391
+ if want == "yn":
392
+ j = f" {joined.lower()} "
393
+ if " yes " in j: return "Yes"
394
+ if " no " in j: return "No"
395
+ return ""
396
+
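+     # Illustrative sketch (not part of the original commit): _pick_nearby tolerates
+     # misaligned columns by scanning around the header-derived anchor. The row is made up.
+     def _demo_pick_nearby(self) -> dict:
+         row = ["1.", "XT 02 GH", "", "20/02/2023", "Yes"]
+         return {
+             "plate": self._pick_nearby(row, 2, "plate"),  # "XT 02 GH", found one column from the anchor
+             "date": self._pick_nearby(row, 3, "date"),    # "20/02/2023"
+             "flag": self._pick_nearby(row, 4, "yn"),      # "Yes"
+         }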
397
+
398
+ def _force_fill_maintenance_from_tables(self, pdf_data: Dict, merged: Dict) -> None:
399
+ """Overwrite Maintenance arrays by scanning ALL maintenance tables across pages."""
400
+ maint = merged.get("Vehicle Registration Numbers Maintenance")
401
+ if not isinstance(maint, dict):
402
+ return
403
+
404
+ tables = (pdf_data.get("extracted_data") or {}).get("all_tables") or []
405
+ regs, rw, mr, dc, fr, rp = [], [], [], [], [], []
406
+
407
+ for t in tables:
408
+ hdrs = [_canon_header(h or "") for h in t.get("headers") or []]
409
+ if not hdrs:
410
+ continue
411
+ # detect a maintenance table
412
+ txt = " ".join(hdrs)
413
+ if ("registration" not in txt) or not any(
414
+ k in txt for k in ["maintenance records", "daily", "fault recording", "fault repair", "roadworthiness"]
415
+ ):
416
+ continue
417
+
418
+ def fidx(pred):
419
+ for i, h in enumerate(hdrs):
420
+ if pred(h):
421
+ return i
422
+ return None
423
+
424
+ reg_i = fidx(lambda h: "registration" in h)
425
+ rw_i = fidx(lambda h: "roadworthiness" in h)
426
+ mr_i = fidx(lambda h: "maintenance" in h and "record" in h)
427
+ dc_i = fidx(lambda h: "daily" in h and "check" in h)
428
+ fr_i = fidx(lambda h: "fault" in h and "record" in h and "suspension" not in h)
429
+ rp_i = fidx(lambda h: "fault" in h and "repair" in h)
430
+
431
+ for r in t.get("data") or []:
432
+ def cell(i):
433
+ if i is None or i >= len(r): return ""
434
+ v = r[i]
435
+ return v.strip() if isinstance(v, str) else str(v).strip()
436
+
437
+ plate = _smart_space(cell(reg_i))
438
+ if not plate or not looks_like_plate(plate):
439
+ continue
440
+
441
+ v_rw = _nz(cell(rw_i))
442
+ v_mr = _nz(cell(mr_i))
443
+ v_dc = _nz(cell(dc_i))
444
+ v_fr = _nz(cell(fr_i))
445
+ v_rp = _nz(cell(rp_i))
446
+
447
+ # sensible fallbacks
448
+ if not v_mr and v_dc: v_mr = v_dc
449
+ if not v_rp and v_fr: v_rp = v_fr
450
+ if not v_fr and v_rp: v_fr = v_rp
451
+
452
+ regs.append(plate); rw.append(v_rw); mr.append(v_mr)
453
+ dc.append(v_dc); fr.append(v_fr); rp.append(v_rp)
454
+
455
+ if regs: # overwrite arrays only if we found rows
456
+ maint["Registration Number"] = regs
457
+ maint["Roadworthiness Certificates"] = rw
458
+ maint["Maintenance Records"] = mr
459
+ maint["Daily Checks"] = dc
460
+ maint["Fault Recording/ Reporting"] = fr
461
+ maint["Fault Repair"] = rp
462
+
463
+ def _collapse_multiline_headers(self, headers: List[str], data_rows: List[List[str]]):
464
+ """
465
+ Merge header continuation rows (when first data rows are not numeric '1.', '2.', …)
466
+ into the main headers, then return (merged_headers, remaining_data_rows).
467
+ """
468
+ merged = [_smart_space(h or "") for h in (headers or [])]
469
+ consumed = 0
470
+ header_frags: List[List[str]] = []
471
+
472
+ # Collect up to 5 leading rows that look like header fragments
473
+ for r in data_rows[:5]:
474
+ first = (str(r[0]).strip() if r else "")
475
+ if re.match(r"^\d+\.?$", first):
476
+ break # real data starts
477
+ consumed += 1
478
+ header_frags.append(r)
479
+
480
+ # Merge every collected fragment row into merged
481
+ for frag in header_frags:
482
+ for i, cell in enumerate(frag):
483
+ cell_txt = _smart_space(str(cell or "").strip())
484
+ if not cell_txt:
485
+ continue
486
+ if i >= len(merged):
487
+ merged.append(cell_txt)
488
+ else:
489
+ merged[i] = (merged[i] + " " + cell_txt).strip()
490
+
491
+ return merged, data_rows[consumed:]
492
+
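+     # Illustrative sketch: a header split across two table rows (hypothetical values) is folded
+     # back into one header list before the numbered data rows are consumed.
+     def _demo_collapse_headers(self):
+         headers = ["No.", "Registration", ""]
+         rows = [["", "Number", "Roadworthiness Certificates"], ["1.", "XT02GH", "20/02/2023"]]
+         # -> (["No.", "Registration Number", "Roadworthiness Certificates"], [["1.", "XT02GH", "20/02/2023"]])
+         return self._collapse_multiline_headers(headers, rows)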
493
+ def _first_attendance_name_title(self, att_list: List[str]) -> Optional[tuple[str, str]]:
494
+ """Return (print_name, position_title) from the first 'Name - Title' in attendance."""
495
+ if not att_list:
496
+ return None
497
+ # First "Name - Title", stop before next "Name -"
498
+ pat = re.compile(
499
+ r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})\s*-\s*(.*?)(?=(?:\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3}\s*-\s*)|$)'
500
+ )
501
+ for item in att_list:
502
+ s = _smart_space(str(item))
503
+ m = pat.search(s)
504
+ if m:
505
+ name = _smart_space(m.group(1))
506
+ title = _smart_space(m.group(2))
507
+ return name, title
508
+ return None
509
+
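+     # Illustrative sketch (fictional attendee): the Operator Declaration fallback splits the
+     # first "Name - Title" attendance entry into its two parts.
+     def _demo_attendance_split(self):
+         # -> ("Jane Citizen", "Transport Manager")
+         return self._first_attendance_name_title(["Jane Citizen - Transport Manager"])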
510
+
511
+ # ───────────────────────────── summary tables (unchanged logic) ─────────────────────────────
512
+ def build_summary_maps(self, pdf_json: dict) -> dict:
513
+ out = {v: {} for v in SUMMARY_SECTIONS.values()}
514
+ try:
515
+ tables = pdf_json["extracted_data"]["all_tables"]
516
+ except Exception:
517
+ return out
518
+
519
+ for t in tables:
520
+ headers = [re.sub(r"\s+", " ", (h or "")).strip().upper() for h in t.get("headers", [])]
521
+ if "DETAILS" not in headers:
522
+ continue
523
+ section_key_raw = next((h for h in headers if h in SUMMARY_SECTIONS), None)
524
+ if not section_key_raw:
525
+ continue
526
+ section_name = SUMMARY_SECTIONS[section_key_raw]
527
+ for row in t.get("data", []):
528
+ if not row: continue
529
+ left = str(row[0]) if len(row) >= 1 else ""
530
+ right = str(row[1]) if len(row) >= 2 else ""
531
+ left_norm = self.normalize_std_label(left)
532
+ if left_norm and right:
533
+ prev = out[section_name].get(left_norm, "")
534
+ merged_text = (prev + " " + right).strip() if prev else right.strip()
535
+ out[section_name][left_norm] = merged_text
536
+
537
+ for sec in out:
538
+ out[sec] = {k: [_smart_space(v)] for k, v in out[sec].items() if v}
539
+ return out
540
+
541
+ # ───────────────────────────── NEW: find cell by label in tables ─────────────────────────────
542
+ def _find_table_value(self, tables: List[Dict], label_variants: List[str]) -> Optional[str]:
543
+ targets = {_canon(v) for v in label_variants}
544
+ for t in tables:
545
+ data = t.get("data", [])
546
+ if not data: continue
547
+ for row in data:
548
+ if not row: continue
549
+ key = _canon(str(row[0]))
550
+ if key in targets:
551
+ vals = [str(c).strip() for c in row[1:] if str(c).strip()]
552
+ if vals:
553
+ return _smart_space(" ".join(vals))
554
+ return None
555
+
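+     # Illustrative sketch (fabricated mini-table): label variants are canonicalised and matched
+     # against the first cell of each row, and the remaining cells become the value.
+     def _demo_find_table_value(self):
+         tables = [{"data": [["Date of Audit", "20/02/2023"], ["Auditor name", "J. Smith"]]}]
+         # -> "20/02/2023"
+         return self._find_table_value(tables, ["Date of Audit", "Audit date"])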
556
+ # ───────────────────────────── comprehensive extraction (minimal changes) ─────────────────────────────
557
+ def extract_from_pdf_comprehensive(self, pdf_data: Dict) -> Dict[str, Any]:
558
+ self._vehicle_by_reg.clear()
559
+ extracted = {}
560
+ extracted_data = pdf_data.get("extracted_data", {})
561
+ tables = extracted_data.get("all_tables", [])
562
+
563
+ # Capture "Audit was conducted on" from tables; ignore placeholder "Date"
564
+ awd = self._find_table_value(
565
+ tables,
566
+ LABEL_INDEX["Audit Declaration dates"]["Audit was conducted on"]["alts"]
567
+ )
568
+ if awd:
569
+ awd = _smart_space(awd)
570
+ if re.search(r"\d", awd) and not re.fullmatch(r"date", awd, re.I):
571
+ extracted["audit_conducted_date"] = awd
572
+
573
+
574
+
575
+ # 1) Audit Information (table first)
576
+ audit_info = extracted_data.get("audit_information", {})
577
+ if audit_info:
578
+ extracted["audit_info"] = {
579
+ "date_of_audit": _smart_space(audit_info.get("DateofAudit", "")),
580
+ "location": _smart_space(audit_info.get("Locationofaudit", "")),
581
+ "auditor_name": _smart_space(audit_info.get("Auditorname", "")),
582
+ "matrix_id": _smart_space(audit_info.get("AuditMatrixIdentifier (Name or Number)", "")),
583
+ }
584
+ # If missing, try generic table lookup
585
+ for label, meta in LABEL_INDEX.get("Audit Information", {}).items():
586
+ if label == "expiry Date:": # not used in your DOCX example
587
+ continue
588
+ val = self._find_table_value(tables, meta.get("alts", [label]))
589
+ if val:
590
+ extracted.setdefault("audit_info", {})
591
+ if _canon(label) == _canon("Date of Audit"): extracted["audit_info"]["date_of_audit"] = val
592
+ elif _canon(label) == _canon("Location of audit"): extracted["audit_info"]["location"] = val
593
+ elif _canon(label) == _canon("Auditor name"): extracted["audit_info"]["auditor_name"] = val
594
+ elif _canon(label) == _canon("Audit Matrix Identifier (Name or Number)"): extracted["audit_info"]["matrix_id"] = val
595
+
596
+ # 2) Operator Information (prefer table rows)
597
+ operator_info = extracted_data.get("operator_information", {})
598
+ if operator_info:
599
+ extracted["operator_info"] = {
600
+ "name": "",
601
+ "trading_name": _smart_space(operator_info.get("trading_name", "")),
602
+ "acn": _smart_space(operator_info.get("company_number", "")),
603
+ "manual": _smart_space(operator_info.get("nhvas_accreditation", "")),
604
+ "business_address": _smart_space(operator_info.get("business_address", "")),
605
+ "postal_address": _smart_space(operator_info.get("postal_address", "")),
606
+ "email": operator_info.get("email", ""),
607
+ "phone": _smart_space(operator_info.get("phone", "")),
608
+ }
609
+
610
+ # Fill operator info via table lookup
611
+ for label, meta in LABEL_INDEX.get("Operator Information", {}).items():
612
+ val = self._find_table_value(tables, meta.get("alts", [label]))
613
+ if not val: continue
614
+ if _canon(label) == _canon("Operator name (Legal entity)") and _looks_like_company(val):
615
+ extracted.setdefault("operator_info", {})
616
+ extracted["operator_info"]["name"] = val
617
+ elif _canon(label) == _canon("Registered trading name/s"):
618
+ extracted.setdefault("operator_info", {})
619
+ extracted["operator_info"]["trading_name"] = val
620
+ elif _canon(label) == _canon("Australian Company Number"):
621
+ extracted.setdefault("operator_info", {})
622
+ extracted["operator_info"]["acn"] = val
623
+ elif _canon(label) == _canon("NHVAS Manual (Policies and Procedures) developed by"):
624
+ extracted.setdefault("operator_info", {})
625
+ if _looks_like_manual_value(val):
626
+ extracted["operator_info"]["manual"] = val
627
+
628
+ # 3) Generic table parsing (unchanged logic for other sections)
629
+ self._extract_table_data(tables, extracted)
630
+
631
+ # 4) Text parsing (kept, but spacing applied)
632
+ self._extract_text_content(extracted_data.get("all_text_content", []), extracted)
633
+ # Vehicle tables sometimes fail to land in all_tables; parse from text as a fallback
634
+ self._extract_vehicle_tables_from_text(extracted_data.get("all_text_content", []), extracted)
635
+
636
+ # 5) Vehicle/Driver data (kept)
637
+ self._extract_vehicle_driver_data(extracted_data, extracted)
638
+
639
+ # 6) Detailed mgmt (kept)
640
+ self._extract_detailed_management_data(extracted_data, extracted)
641
+
642
+ return extracted
643
+
644
+ # ───────────────────────────── table classifiers ─────────────────────────────
645
+ def _extract_table_data(self, tables: List[Dict], extracted: Dict):
647
+ for table in tables:
648
+ headers = table.get("headers", []) or []
649
+ data_rows = table.get("data", []) or []
650
+ if not data_rows:
651
+ continue
652
+
653
+ page_num = table.get("page", 0)
654
+ self.log_debug(f"Processing table on page {page_num} with headers: {headers[:3]}...")
655
+
656
+ # 🔧 NEW: collapse possible multi-line headers once up front
657
+ collapsed_headers, collapsed_rows = self._collapse_multiline_headers(headers, data_rows)
658
+
659
+ # 🔧 Try vehicle tables FIRST using either raw or collapsed headers
660
+ if self._is_vehicle_registration_table(headers) or self._is_vehicle_registration_table(collapsed_headers):
661
+ # always extract with the collapsed header/rows so we see "Registration Number", etc.
662
+ self._extract_vehicle_registration_table(collapsed_headers, collapsed_rows, extracted, page_num)
663
+ continue
664
+
665
+ # the rest keep their existing order/logic (use the original headers/rows)
666
+ if self._is_audit_info_table(headers):
667
+ self._extract_audit_info_table(data_rows, extracted)
668
+ elif self._is_operator_info_table(headers):
669
+ self._extract_operator_info_table(data_rows, extracted)
670
+ elif self._is_attendance_table(headers):
671
+ self._extract_attendance_table(data_rows, extracted)
672
+ elif self._is_vehicle_summary_table(headers):
673
+ self._extract_vehicle_summary_table(data_rows, extracted)
674
+ elif self._is_driver_table(headers):
675
+ self._extract_driver_table(headers, data_rows, extracted)
676
+ elif self._is_management_compliance_table(headers):
677
+ self._extract_management_table(data_rows, extracted, headers)
678
+
679
+
680
+ def _is_audit_info_table(self, headers: List[str]) -> bool:
681
+ txt = " ".join(str(h) for h in headers).lower()
682
+ return any(t in txt for t in ["audit", "date", "location", "auditor"])
683
+
684
+ def _is_operator_info_table(self, headers: List[str]) -> bool:
685
+ txt = " ".join(str(h) for h in headers).lower()
686
+ return any(t in txt for t in ["operator", "company", "trading", "address"])
687
+
688
+ def _is_attendance_table(self, headers: List[str]) -> bool:
689
+ txt = " ".join(str(h) for h in headers).lower()
690
+ return "attendance" in txt
691
+
692
+ def _is_vehicle_summary_table(self, headers: List[str]) -> bool:
693
+ txt = " ".join(str(h) for h in headers).lower()
694
+ return any(t in txt for t in ["powered vehicles", "trailing vehicles", "drivers in bfm"])
695
+
696
+ def _is_vehicle_registration_table(self, headers: List[str]) -> bool:
697
+ if not headers: return False
698
+ ch = [_canon_header(h) for h in headers]
699
+ has_reg = any(
700
+ ("registration" in h) or re.search(r"\breg(?:istration)?\b", h) or ("reg" in h and "no" in h)
701
+ for h in ch
702
+ )
703
+ others = ["roadworthiness","maintenance records","daily checks","fault recording","fault repair",
704
+ "sub contractor","sub-contractor","weight verification","rfs suspension","suspension system maintenance",
705
+ "trip records","fault recording reporting on suspension system","fault reporting suspension"]
706
+ has_signal = any(any(tok in h for tok in others) for h in ch)
707
+ return has_reg and has_signal
708
+
709
+ def _is_driver_table(self, headers: List[str]) -> bool:
710
+ txt = " ".join(str(h) for h in headers).lower()
711
+ return any(t in txt for t in ["driver", "scheduler", "tlif", "medical"])
712
+
713
+ def _is_management_compliance_table(self, headers: List[str]) -> bool:
714
+ txt = " ".join(str(h) for h in headers).lower()
715
+ return any(t in txt for t in ["maintenance management", "mass management", "fatigue management"])
716
+
717
+ def _extract_vehicle_tables_from_text(self, text_pages: List[Dict], extracted: Dict):
718
+ # flatten text
719
+ lines = []
720
+ for p in text_pages or []:
721
+ for ln in re.split(r"\s*\n\s*", p.get("text", "")):
722
+ ln = _smart_space(ln)
723
+ if ln: lines.append(ln)
724
+
725
+ maint_rows, mass_rows = [], []
726
+ rf_pat = re.compile(r"\bRF\s*\d+\b", re.IGNORECASE)
727
+
728
+ for ln in lines:
729
+ # _smart_space splits letter/digit runs ("XT02GH" -> "XT 02 GH"), so test
+ # re-joined runs of up to three adjacent tokens, not just single tokens
+ tokens = ln.split()
+ grams = (" ".join(tokens[i:i + n]) for i in range(len(tokens)) for n in (3, 2, 1))
+ reg = next((g for g in grams if looks_like_plate(g)), None)
732
+ if not reg:
733
+ continue
734
+
735
+ # everything after the reg on that line
736
+ tail = _smart_space(ln.split(reg, 1)[1]) if reg in ln else ""
737
+ dates = extract_date_tokens(tail)
738
+ has_rf = bool(rf_pat.search(ln)) or "suspension" in ln.lower()
739
+
740
+ if has_rf:
741
+ rfs = (rf_pat.search(ln).group(0).upper().replace(" ", "") if rf_pat.search(ln) else "")
742
+ wv = dates[0] if len(dates) > 0 else ""
743
+ rest = dates[1:]
744
+ mass_rows.append({
745
+ "registration": reg,
746
+ "sub_contractor": "Yes" if " yes " in f" {ln.lower()} " else ("No" if " no " in f" {ln.lower()} " else ""),
747
+ "sub_comp": "Yes" if " yes " in f" {ln.lower()} " else ("No" if " no " in f" {ln.lower()} " else ""),
748
+ "weight_verification": wv,
749
+ "rfs_certification": rfs or ("N/A" if "n/a" in ln.lower() else ""),
750
+ "suspension_maintenance": rest[0] if len(rest) > 0 else "",
751
+ "trip_records": rest[1] if len(rest) > 1 else "",
752
+ "fault_reporting_suspension": rest[2] if len(rest) > 2 else "",
753
+ })
754
+ else:
755
+ # map first 5 date-like tokens in sensible order; fallbacks keep table consistent
756
+ rw = dates[0] if len(dates) > 0 else ""
757
+ mr = dates[1] if len(dates) > 1 else ""
758
+ dc = dates[2] if len(dates) > 2 else ""
759
+ fr = dates[3] if len(dates) > 3 else ""
760
+ rp = dates[4] if len(dates) > 4 else ""
761
+ maint_rows.append({
762
+ "registration": reg,
763
+ "roadworthiness": rw,
764
+ "maintenance_records": mr or dc,
765
+ "daily_checks": dc,
766
+ "fault_recording": fr or rp,
767
+ "fault_repair": rp or fr,
768
+ })
769
+
770
+ # After building maint_rows and mass_rows, merge them into the shared vehicles list
771
+ vlist = extracted.setdefault("vehicles", []) # ensure it always exists
772
+
773
+ if maint_rows or mass_rows:
774
+ for r in maint_rows:
775
+ r["section"] = "maintenance"
776
+ vlist.append(r)
777
+ for r in mass_rows:
778
+ r["section"] = "mass"
779
+ vlist.append(r)
780
+ self.log_debug(f"Vehicle rows (text fallback): maint={len(maint_rows)} mass={len(mass_rows)} total={len(vlist)}")
781
+ else:
782
+ self.log_debug("Vehicle rows (text fallback): none detected.")
783
+
784
+
785
+ # ───────────────────────────── simple extractors (spacing applied) ─────────────────────────────
786
+ def _extract_audit_info_table(self, data_rows: List[List], extracted: Dict):
787
+ ai = extracted.setdefault("audit_info", {})
788
+ for row in data_rows:
789
+ if len(row) < 2: continue
790
+ key = _canon(row[0])
791
+ val = _smart_space(" ".join(str(c).strip() for c in row[1:] if str(c).strip()))
792
+ if not val: continue
793
+ if "date" in key and "audit" in key: ai["date_of_audit"] = val
794
+ elif "location" in key: ai["location"] = val
795
+ elif "auditor" in key and "name" in key: ai["auditor_name"] = val
796
+ elif "matrix" in key: ai["matrix_id"] = val
797
+
798
+ def _extract_operator_info_table(self, data_rows: List[List], extracted: Dict):
799
+ oi = extracted.setdefault("operator_info", {})
800
+ for row in data_rows:
801
+ if len(row) < 2: continue
802
+ key = _canon(row[0])
803
+ val = _smart_space(" ".join(str(c).strip() for c in row[1:] if str(c).strip()))
804
+ if not val: continue
805
+ if "operator" in key and "name" in key and _looks_like_company(val): oi["name"] = val
806
+ elif "trading" in key: oi["trading_name"] = val
807
+ elif "australian" in key and "company" in key: oi["acn"] = val
808
+ elif "business" in key and "address" in key: oi["business_address"] = val
809
+ elif "postal" in key and "address" in key: oi["postal_address"] = val
810
+ elif "email" in key: oi["email"] = val
811
+ elif "telephone" in key or "phone" in key: oi["phone"] = val
812
+ elif "manual" in key or ("nhvas" in key and "manual" in key) or "developed" in key:
813
+ if _looks_like_manual_value(val):
814
+ oi["manual"] = val
815
+
816
+ def _extract_attendance_table(self, data_rows: List[List], extracted: Dict):
817
+ lst = []
818
+ for row in data_rows:
819
+ if not row: continue
820
+ cells = [str(c).strip() for c in row if str(c).strip()]
821
+ if not cells: continue
822
+ lst.append(_smart_space(" ".join(cells)))
823
+ if lst:
824
+ extracted["attendance"] = lst
825
+
826
+ def _extract_vehicle_summary_table(self, data_rows: List[List], extracted: Dict):
827
+ vs = extracted.setdefault("vehicle_summary", {})
828
+ for row in data_rows:
829
+ if len(row) < 2: continue
830
+ key = _canon(row[0])
831
+ value = ""
832
+ for c in row[1:]:
833
+ if str(c).strip():
834
+ value = _smart_space(str(c).strip()); break
835
+ if not value: continue
836
+ if "powered" in key and "vehicle" in key: vs["powered_vehicles"] = value
837
+ elif "trailing" in key and "vehicle" in key: vs["trailing_vehicles"] = value
838
+ elif "drivers" in key and "bfm" in key: vs["drivers_bfm"] = value
839
+ elif "drivers" in key and "afm" in key: vs["drivers_afm"] = value
840
+
841
+ # ▶▶ REPLACED: column mapping by headers
842
+ def _extract_vehicle_registration_table(self, headers, rows, extracted, page_num):
843
+ ch = [_canon_header(h) for h in (headers or [])]
844
+ alias = _map_header_indices(headers or [])
845
+
846
+ # header indices (may be misaligned vs data; that's OK, we’ll search near them)
847
+ def idx_of(*needles):
848
+ for i, h in enumerate(ch):
849
+ if all(n in h for n in needles): return i
850
+ return None
851
+
852
+ # plain `or`-chaining would skip a legitimate column index 0, so treat only None as "missing"
+ def first_idx(*cands):
+ return next((c for c in cands if c is not None), None)
+
+ reg_i = first_idx(alias.get("registration"), idx_of("registration number"), idx_of("registration"), idx_of("reg", "no"))
+ rw_i = first_idx(alias.get("roadworthiness"), idx_of("roadworthiness"))
+ maint_i = first_idx(alias.get("maintenance_records"), idx_of("maintenance", "records"))
+ daily_i = first_idx(alias.get("daily_checks"), idx_of("daily", "check"))
+ fr_i = first_idx(alias.get("fault_recording"), idx_of("fault", "recording"))
+ rep_i = first_idx(alias.get("fault_repair"), idx_of("fault", "repair"))
+
+ weight_i = first_idx(alias.get("weight_verification"), idx_of("weight", "verification"))
+ rfs_i = first_idx(alias.get("rfs_certification"), idx_of("rfs", "certification"))
+ susp_i = first_idx(alias.get("suspension_maintenance"), idx_of("suspension", "maintenance"))
+ trip_i = first_idx(alias.get("trip_records"), idx_of("trip", "records"))
+ frs_i = first_idx(alias.get("fault_reporting_suspension"), idx_of("fault", "reporting", "suspension"))
864
+
865
+ # classify table type by header signals
866
+ is_maint = any("roadworthiness" in h or "maintenance records" in h or ("daily" in h and "check" in h) or "fault repair" in h for h in ch)
867
+ is_mass = any("weight verification" in h or "rfs" in h or "suspension system" in h or "trip records" in h or "reporting on suspension" in h for h in ch)
868
+
869
+ maint_rows = extracted.setdefault("_maint_rows", []) if is_maint else None
870
+ added = 0
871
+
872
+ for r in rows or []:
873
+ # tolerant plate pick (handles misaligned columns)
874
+ reg = self._pick_nearby(r, reg_i, "plate", window=4)
875
+ if not reg or not looks_like_plate(reg):
876
+ continue
877
+
878
+ # collect values using tolerant picks
879
+ if is_maint:
880
+ rw = self._pick_nearby(r, rw_i, "date", window=4)
881
+ mr = self._pick_nearby(r, maint_i, "date", window=4)
882
+ dc = self._pick_nearby(r, daily_i, "date", window=4)
883
+ fr = self._pick_nearby(r, fr_i, "date", window=4)
884
+ rep = self._pick_nearby(r, rep_i, "date", window=4)
885
+
886
+ # sensible fallbacks
887
+ if not mr and dc: mr = dc
888
+ if not rep and fr: rep = fr
889
+ if not fr and rep: fr = rep
890
+
891
+ else: # mass or mixed
892
+ wv = self._pick_nearby(r, weight_i, "date", window=4)
893
+ rfs = self._pick_nearby(r, rfs_i, "rf", window=5)
894
+ sm = self._pick_nearby(r, susp_i, "date", window=4)
895
+ tr = self._pick_nearby(r, trip_i, "date", window=4)
896
+ frs = self._pick_nearby(r, frs_i, "date", window=4)
897
+ yn1 = self._pick_nearby(r, idx_of("sub","contractor"), "yn", window=3) or ""
898
+ yn2 = self._pick_nearby(r, idx_of("sub contracted vehicles statement of compliance"), "yn", window=3) or yn1
899
+
900
+ # merge into vehicle map
901
+ v = self._vehicle_by_reg.get(reg)
902
+ if v is None:
903
+ v = {"registration": reg}
904
+ self._vehicle_by_reg[reg] = v
905
+ added += 1
906
+
907
+ if is_maint:
908
+ v["seen_in_maintenance"] = True
909
+ if rw: v.setdefault("roadworthiness", rw)
910
+ if mr: v.setdefault("maintenance_records", mr)
911
+ if dc: v.setdefault("daily_checks", dc)
912
+ if fr: v.setdefault("fault_recording", fr)
913
+ if rep: v.setdefault("fault_repair", rep)
914
+
915
+ if maint_rows is not None:
916
+ maint_rows.append({
917
+ "registration": reg,
918
+ "roadworthiness": rw,
919
+ "maintenance_records": mr or dc,
920
+ "daily_checks": dc,
921
+ "fault_recording": fr or rep,
922
+ "fault_repair": rep or fr,
923
+ })
924
+ else:
925
+ v["seen_in_mass"] = True
926
+ if yn1: v.setdefault("sub_contractor", yn1)
927
+ if yn2: v.setdefault("sub_comp", yn2)
928
+ if wv: v.setdefault("weight_verification", wv)
929
+ if rfs: v.setdefault("rfs_certification", _smart_space(rfs).upper().replace(" ", ""))
930
+ if sm: v.setdefault("suspension_maintenance", sm)
931
+ if tr: v.setdefault("trip_records", tr)
932
+ if frs: v.setdefault("fault_reporting_suspension", frs)
933
+
934
+ extracted["vehicles"] = list(self._vehicle_by_reg.values())
935
+ return added
936
+
937
+ def _extract_driver_table(self, headers: List[str], data_rows: List[List], extracted: Dict):
938
+ """Header-driven extraction for Driver / Scheduler Records."""
939
+ drivers = []
940
+ ch = [_canon_header(h) for h in headers or []]
941
+
942
+ # helpers
943
+ def find_col(needles: list[str]) -> Optional[int]:
944
+ for i, h in enumerate(ch):
945
+ if any(n in h for n in needles):
946
+ return i
947
+ return None
948
+
949
+ def find_col_rx(patterns: list[str]) -> Optional[int]:
950
+ for i, h in enumerate(ch):
951
+ if any(re.search(p, h) for p in patterns):
952
+ return i
953
+ return None
954
+
955
+ name_idx = find_col_rx([r"\bdriver\s*/\s*scheduler\s*name\b",
956
+ r"\bdriver\s+name\b", r"\bscheduler\s+name\b", r"\bname\b"])
957
+ tlif_d_idx = find_col(["driver tlif"])
958
+ tlif_s_idx = find_col(["scheduler tlif"])
959
+ medical_idx= find_col(["medical", "expiry"])
960
+ roster_idx = find_col_rx([r"\broster\b", r"\bsafe\s+driving\s+plan\b", r"\bschedule\b(?!r\b)"])
961
+ fit_idx = find_col(["fit for duty"])
962
+ diary_idx = find_col(["work diary", "electronic work diary", "page numbers"])
963
+
964
+ for row in data_rows:
965
+ if not row:
966
+ continue
967
+
968
+ name = None
969
+ if name_idx is not None and name_idx < len(row):
970
+ name = _smart_space(str(row[name_idx]).strip())
971
+ if not name:
972
+ continue
973
+
974
+ d = {"name": name}
975
+
976
+ if tlif_d_idx is not None and tlif_d_idx < len(row):
977
+ d["driver_tlif"] = _smart_space(str(row[tlif_d_idx]).strip())
978
+ if tlif_s_idx is not None and tlif_s_idx < len(row):
979
+ d["scheduler_tlif"] = _smart_space(str(row[tlif_s_idx]).strip())
980
+ if medical_idx is not None and medical_idx < len(row):
981
+ d["medical_expiry"] = _smart_space(str(row[medical_idx]).strip())
982
+
983
+ # Roster/Schedule/SDP: prefer the detected column; accept only date/range-like, not the name
984
+ if roster_idx is not None and roster_idx < len(row):
985
+ raw_roster = _smart_space(str(row[roster_idx]).strip())
986
+ if raw_roster and re.search(r"[0-9/–-]", raw_roster) and raw_roster.lower() != name.lower():
987
+ d["roster_schedule"] = raw_roster
988
+
989
+ # Fallback: scan the row for the first date/range-like cell that's not the name cell
990
+ if "roster_schedule" not in d:
991
+ for j, cell in enumerate(row):
992
+ if j == name_idx:
993
+ continue
994
+ s = _smart_space(str(cell).strip())
995
+ if s and re.search(r"[0-9/–-]", s) and s.lower() != name.lower():
996
+ d["roster_schedule"] = s
997
+ break
998
+
999
+ if fit_idx is not None and fit_idx < len(row):
1000
+ d["fit_for_duty"] = _smart_space(str(row[fit_idx]).strip())
1001
+ if diary_idx is not None and diary_idx < len(row):
1002
+ d["work_diary"] = _smart_space(str(row[diary_idx]).strip())
1003
+
1004
+ drivers.append(d)
1005
+
1006
+ if drivers:
1007
+ extracted["drivers_detailed"] = drivers
1008
+ self.log_debug(f"Driver rows extracted (header-based): {len(drivers)}")
1009
+
1010
+
1011
+ def _extract_management_table(self, data_rows: List[List], extracted: Dict, headers: List[str]):
1012
+ txt = " ".join(str(h) for h in headers).lower()
1013
+ comp = {}
1014
+ for row in data_rows:
1015
+ if len(row) < 2: continue
1016
+ std = str(row[0]).strip()
1017
+ val = _smart_space(str(row[1]).strip())
1018
+ if std.startswith("Std") and val:
1019
+ comp[std] = val
1020
+ if comp:
1021
+ if "maintenance" in txt: extracted["maintenance_compliance"] = comp
1022
+ elif "mass" in txt: extracted["mass_compliance"] = comp
1023
+ elif "fatigue" in txt: extracted["fatigue_compliance"] = comp
1024
+
1025
+ def _extract_text_content(self, text_pages: List[Dict], extracted: Dict):
1026
+ all_text = " ".join(page.get("text", "") for page in text_pages)
1027
+ all_text = _smart_space(all_text)
1028
+
1029
+ # business summary
1030
+ patt = [
1031
+ r"Nature of the Operators? Business.*?:\s*(.*?)(?:Accreditation Number|Expiry Date|$)",
1032
+ r"Nature of.*?Business.*?Summary.*?:\s*(.*?)(?:Accreditation|$)"
1033
+ ]
1034
+ for p in patt:
1035
+ m = re.search(p, all_text, re.IGNORECASE | re.DOTALL)
1036
+ if m:
1037
+ txt = re.sub(r'\s+', ' ', m.group(1).strip())
1038
+ txt = re.sub(r'\s*(Accreditation Number.*|Expiry Date.*)', '', txt, flags=re.IGNORECASE)
1039
+ if len(txt) > 50:
1040
+ extracted["business_summary"] = txt
1041
+ break
1042
+
1043
+ # audit conducted date
1044
+ for p in [
1045
+ r"Audit was conducted on\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
1046
+ r"DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})",
1047
+ r"AUDITOR SIGNATURE\s+DATE\s+([0-9]+(?:st|nd|rd|th)?\s+[A-Za-z]+\s+\d{4})"
1048
+ ]:
1049
+ m = re.search(p, all_text, re.IGNORECASE)
1050
+ if m:
1051
+ extracted["audit_conducted_date"] = _smart_space(m.group(1).strip())
1052
+ break
1053
+
1054
+ # print accreditation name
1055
+ for p in [
1056
+ r"\(print accreditation name\)\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)",
1057
+ r"print accreditation name.*?\n\s*([A-Za-z0-9\s&().,'/\-]+?)(?:\s+DOES|\s+does|\n|$)"
1058
+ ]:
1059
+ m = re.search(p, all_text, re.IGNORECASE)
1060
+ if m:
1061
+ extracted["print_accreditation_name"] = _smart_space(m.group(1).strip())
1062
+ break
1063
+
1064
+ # numbers in text (optional)
1065
+ for p in [
1066
+ r"Number of powered vehicles\s+(\d+)",
1067
+ r"powered vehicles\s+(\d+)",
1068
+ r"Number of trailing vehicles\s+(\d+)",
1069
+ r"trailing vehicles\s+(\d+)",
1070
+ r"Number of drivers in BFM\s+(\d+)",
1071
+ r"drivers in BFM\s+(\d+)"
1072
+ ]:
1073
+ m = re.search(p, all_text, re.IGNORECASE)
1074
+ if m:
1075
+ val = m.group(1)
1076
+ if "powered" in p: extracted.setdefault("vehicle_summary", {})["powered_vehicles"] = val
1077
+ elif "trailing" in p: extracted.setdefault("vehicle_summary", {})["trailing_vehicles"] = val
1078
+ elif "bfm" in p.lower(): extracted.setdefault("vehicle_summary", {})["drivers_bfm"] = val
1079
+
1080
+ def _extract_detailed_management_data(self, extracted_data: Dict, extracted: Dict):
1081
+ all_tables = extracted_data.get("all_tables", [])
1082
+ for table in all_tables:
1083
+ headers = table.get("headers", [])
1084
+ data_rows = table.get("data", [])
1085
+ page_num = table.get("page", 0)
1086
+ if self._has_details_column(headers):
1087
+ section = self._identify_management_section(headers)
1088
+ if section:
1089
+ self._extract_management_details(data_rows, extracted, section)
1090
+ elif 6 <= page_num <= 15:
1091
+ self._extract_summary_by_content(data_rows, headers, extracted, page_num)
1092
+
1093
+ def _extract_summary_by_content(self, data_rows: List[List], headers: List[str], extracted: Dict, page_num: int):
1094
+ section_type = "maintenance" if 6 <= page_num <= 9 else "mass" if 10 <= page_num <= 12 else "fatigue" if 13 <= page_num <= 15 else None
1095
+ if not section_type: return
1096
+ details_key = f"{section_type}_summary_details"
1097
+ extracted[details_key] = {}
1098
+ for row in data_rows:
1099
+ if len(row) < 2: continue
1100
+ standard = str(row[0]).strip()
1101
+ details = _smart_space(str(row[1]).strip())
1102
+ if standard.startswith("Std") and details and len(details) > 10:
1103
+ m = re.search(r"Std\s+(\d+)\.\s*([^(]+)", standard)
1104
+ if m:
1105
+ key = f"Std {m.group(1)}. {m.group(2).strip()}"
1106
+ extracted[details_key][key] = details
1107
+
1108
+ def _has_details_column(self, headers: List[str]) -> bool:
1109
+ return "details" in " ".join(str(h) for h in headers).lower()
1110
+
1111
+ def _identify_management_section(self, headers: List[str]) -> Optional[str]:
1112
+ txt = " ".join(str(h) for h in headers).lower()
1113
+ if "maintenance" in txt: return "maintenance"
1114
+ if "mass" in txt: return "mass"
1115
+ if "fatigue" in txt: return "fatigue"
1116
+ return None
1117
+
1118
+ def _extract_management_details(self, data_rows: List[List], extracted: Dict, section: str):
1119
+ details_key = f"{section}_details"
1120
+ extracted[details_key] = {}
1121
+ for row in data_rows:
1122
+ if len(row) < 2: continue
1123
+ standard = str(row[0]).strip()
1124
+ details = _smart_space(str(row[1]).strip())
1125
+ if standard.startswith("Std") and details and details != "V" and len(details) > 10:
1126
+ m = re.search(r"Std\s+\d+\.\s*([^(]+)", standard)
1127
+ if m:
1128
+ extracted[details_key][m.group(1).strip()] = details
1129
+
1130
+ def _extract_vehicle_driver_data(self, extracted_data: Dict, extracted: Dict):
1131
+ vehicle_regs = extracted_data.get("vehicle_registrations", [])
1132
+ if vehicle_regs:
1133
+ extracted["vehicle_registrations"] = vehicle_regs
1134
+ driver_records = extracted_data.get("driver_records", [])
1135
+ if driver_records:
1136
+ extracted["driver_records"] = driver_records
1137
+
1138
+
1141
+ def map_vehicle_registration_arrays(self, pdf_extracted: Dict, merged: Dict):
1142
+ """Extract and map vehicle registration data (Maintenance + Mass) to DOCX arrays."""
1143
+ vehicles_src = []
1144
+
1145
+ # Prefer rows we parsed ourselves (header-based). Fall back to curated list if present.
1146
+ if "vehicles" in pdf_extracted and isinstance(pdf_extracted["vehicles"], list):
1147
+ vehicles_src = pdf_extracted["vehicles"]
1148
+ elif "vehicle_registrations" in pdf_extracted and isinstance(pdf_extracted["vehicle_registrations"], list):
1149
+ # Normalize curated structure (list of dicts with keys like 'registration_number', etc.)
1150
+ for row in pdf_extracted["vehicle_registrations"]:
1151
+ if not isinstance(row, dict):
1152
+ continue
1153
+ v = {
1154
+ "registration": _smart_space(row.get("registration_number") or row.get("registration") or ""),
1155
+ # Maintenance table columns (names as seen in curated JSON)
1156
+ "roadworthiness": _smart_space(row.get("roadworthiness_certificates", "")),
1157
+ "maintenance_records": _smart_space(row.get("maintenance_records", "")),
1158
+ "daily_checks": _smart_space(row.get("daily_checks", "")),
1159
+ "fault_recording": _smart_space(row.get("fault_recording_reporting", "")),
1160
+ "fault_repair": _smart_space(row.get("fault_repair", "")),
1161
+ # Mass table columns (in case the curated list ever includes them)
1162
+ "sub_contractor": _smart_space(row.get("sub_contractor", "")),
1163
+ "sub_comp": _smart_space(row.get("sub_contracted_vehicles_statement_of_compliance", "")),
1164
+ "weight_verification": _smart_space(row.get("weight_verification_records", "")),
1165
+ "rfs_certification": _smart_space(row.get("rfs_suspension_certification", row.get("rfs_suspension_certification_#", ""))),
1166
+ "suspension_maintenance": _smart_space(row.get("suspension_system_maintenance", "")),
1167
+ "trip_records": _smart_space(row.get("trip_records", "")),
1168
+ "fault_reporting_suspension": _smart_space(row.get("fault_recording_reporting_on_suspension_system", "")),
1169
+ }
1170
+ if v["registration"]:
1171
+ vehicles_src.append(v)
1172
+
1173
+ if not vehicles_src:
1174
+ return # nothing to map
1175
+
1176
+ # Build column arrays
1177
+ regs = []
1178
+ roadworthiness = []
1179
+ maint_records = []
1180
+ daily_checks = []
1181
+ fault_recording = []
1182
+ fault_repair = []
1183
+
1184
+ sub_contractors = []
1185
+ weight_verification = []
1186
+ rfs_certification = []
1187
+ suspension_maintenance = []
1188
+ trip_records = []
1189
+ fault_reporting_suspension = []
1190
+
1191
+ for v in vehicles_src:
1192
+ reg = _smart_space(v.get("registration", "")).strip()
1193
+ if not reg:
1194
+ continue
1195
+ regs.append(reg)
1196
+
1197
+ roadworthiness.append(_smart_space(v.get("roadworthiness", "")).strip())
1198
+ maint_records.append(_smart_space(v.get("maintenance_records", "")).strip())
1199
+ daily_checks.append(_smart_space(v.get("daily_checks", "")).strip())
1200
+ fault_recording.append(_smart_space(v.get("fault_recording", "")).strip())
1201
+ fault_repair.append(_smart_space(v.get("fault_repair", "")).strip())
1202
+
1203
+ sub_contractors.append(_smart_space(v.get("sub_contractor", "")).strip())
1204
+ weight_verification.append(_smart_space(v.get("weight_verification", "")).strip())
1205
+ rfs_certification.append(_smart_space(v.get("rfs_certification", "")).strip())
1206
+ suspension_maintenance.append(_smart_space(v.get("suspension_maintenance", "")).strip())
1207
+ trip_records.append(_smart_space(v.get("trip_records", "")).strip())
1208
+ fault_reporting_suspension.append(_smart_space(v.get("fault_reporting_suspension", "")).strip())
1209
+
1210
+ # Update Maintenance table arrays (if present in template)
1211
+ if "Vehicle Registration Numbers Maintenance" in merged and regs:
1212
+ m = merged["Vehicle Registration Numbers Maintenance"]
1213
+ m["Registration Number"] = regs
1214
+ m["Roadworthiness Certificates"] = roadworthiness
1215
+ m["Maintenance Records"] = maint_records
1216
+ m["Daily Checks"] = daily_checks
1217
+ m["Fault Recording/ Reporting"] = fault_recording
1218
+ m["Fault Repair"] = fault_repair
1219
+
1220
+ # Update Mass table arrays (if present in template)
1221
+ if "Vehicle Registration Numbers Mass" in merged and regs:
1222
+ ms = merged["Vehicle Registration Numbers Mass"]
1223
+ ms["Registration Number"] = regs
1224
+ ms["Sub contractor"] = sub_contractors
1225
+ ms["Weight Verification Records"] = weight_verification
1226
+ ms["RFS Suspension Certification #"] = rfs_certification
1227
+ ms["Suspension System Maintenance"] = suspension_maintenance
1228
+ ms["Trip Records"] = trip_records
1229
+ ms["Fault Recording/ Reporting on Suspension System"] = fault_reporting_suspension
1230
+
1231
+ self.log_debug(f"Updated vehicle registration arrays for {len(regs)} vehicles")
1232
+ # ───────────────────────────── map to DOCX (apply spacing + safe fallbacks) ─────────────────────────────
1233
+ def map_to_docx_structure(self, pdf_extracted: Dict, docx_data: Dict, pdf_data: Dict) -> Dict:
1234
+ merged = json.loads(json.dumps(docx_data))
1235
+
1236
+ # Audit Information
1237
+ if "audit_info" in pdf_extracted and "Audit Information" in merged:
1238
+ ai = pdf_extracted["audit_info"]
1239
+ if ai.get("date_of_audit"):
1240
+ merged["Audit Information"]["Date of Audit"] = [_smart_space(ai["date_of_audit"])]
1241
+ if ai.get("location"):
1242
+ merged["Audit Information"]["Location of audit"] = [_smart_space(ai["location"])]
1243
+ if ai.get("auditor_name"):
1244
+ merged["Audit Information"]["Auditor name"] = [_smart_space(ai["auditor_name"])]
1245
+ if ai.get("matrix_id"):
1246
+ merged["Audit Information"]["Audit Matrix Identifier (Name or Number)"] = [_smart_space(ai["matrix_id"])]
1247
+
1248
+ # Operator Information
1249
+ if "operator_info" in pdf_extracted and "Operator Information" in merged:
1250
+ op = pdf_extracted["operator_info"]
1251
+ if op.get("name") and _looks_like_company(op["name"]):
1252
+ merged["Operator Information"]["Operator name (Legal entity)"] = [_smart_space(op["name"])]
1253
+ if op.get("trading_name"):
1254
+ merged["Operator Information"]["Registered trading name/s"] = [_smart_space(op["trading_name"])]
1255
+ if op.get("acn"):
1256
+ merged["Operator Information"]["Australian Company Number"] = [_smart_space(op["acn"])]
1257
+ if op.get("manual"):
1258
+ merged["Operator Information"]["NHVAS Manual (Policies and Procedures) developed by"] = [_smart_space(op["manual"])]
1259
+
1260
+ # Contact details
1261
+ if "operator_info" in pdf_extracted and "Operator contact details" in merged:
1262
+ op = pdf_extracted["operator_info"]
1263
+ if op.get("business_address"):
1264
+ merged["Operator contact details"]["Operator business address"] = [_smart_space(op["business_address"])]
1265
+ if op.get("postal_address"):
1266
+ merged["Operator contact details"]["Operator Postal address"] = [_smart_space(op["postal_address"])]
1267
+ if op.get("email"):
1268
+ merged["Operator contact details"]["Email address"] = [op["email"]]
1269
+ if op.get("phone"):
1270
+ merged["Operator contact details"]["Operator Telephone Number"] = [_smart_space(op["phone"])]
1271
+
1272
+ # Attendance
1273
+ if "attendance" in pdf_extracted and "Attendance List (Names and Position Titles)" in merged:
1274
+ merged["Attendance List (Names and Position Titles)"]["Attendance List (Names and Position Titles)"] = _clean_list(pdf_extracted["attendance"])
1275
+
1276
+ # Business summary
1277
+ if "business_summary" in pdf_extracted and "Nature of the Operators Business (Summary)" in merged:
1278
+ merged["Nature of the Operators Business (Summary)"]["Nature of the Operators Business (Summary):"] = [_smart_space(pdf_extracted["business_summary"])]
1279
+
1280
+ # Vehicle summary
1281
+ if "vehicle_summary" in pdf_extracted:
1282
+ vs = pdf_extracted["vehicle_summary"]
1283
+ if "Accreditation Vehicle Summary" in merged:
1284
+ if vs.get("powered_vehicles"):
1285
+ merged["Accreditation Vehicle Summary"]["Number of powered vehicles"] = [vs["powered_vehicles"]]
1286
+ if vs.get("trailing_vehicles"):
1287
+ merged["Accreditation Vehicle Summary"]["Number of trailing vehicles"] = [vs["trailing_vehicles"]]
1288
+ if "Accreditation Driver Summary" in merged:
1289
+ if vs.get("drivers_bfm"):
1290
+ merged["Accreditation Driver Summary"]["Number of drivers in BFM"] = [vs["drivers_bfm"]]
1291
+ if vs.get("drivers_afm"):
1292
+ merged["Accreditation Driver Summary"]["Number of drivers in AFM"] = [vs["drivers_afm"]]
1293
+
1294
+ # Summary sections (unchanged behavior)
1295
+ summary_maps = self.build_summary_maps(pdf_data)
1296
+ for section_name, std_map in summary_maps.items():
1297
+ if section_name in merged and std_map:
1298
+ for detail_key, details_list in std_map.items():
1299
+ if detail_key in merged[section_name]:
1300
+ merged[section_name][detail_key] = details_list
1301
+ continue
1302
+ for docx_key in list(merged[section_name].keys()):
1303
+ m1 = re.search(r"Std\s+(\d+)", detail_key)
1304
+ m2 = re.search(r"Std\s+(\d+)", docx_key)
1305
+ if m1 and m2 and m1.group(1) == m2.group(1):
1306
+ merged[section_name][docx_key] = details_list
1307
+ break
1308
+
1309
+ # Vehicle registration arrays via consolidated builder
1310
+ sections = build_vehicle_sections(pdf_extracted)
1311
+ if "Vehicle Registration Numbers Maintenance" in merged:
1312
+ merged["Vehicle Registration Numbers Maintenance"].update(
1313
+ sections["Vehicle Registration Numbers Maintenance"]
1314
+ )
1315
+ if "Vehicle Registration Numbers Mass" in merged:
1316
+ merged["Vehicle Registration Numbers Mass"].update(
1317
+ sections["Vehicle Registration Numbers Mass"]
1318
+ )
1319
+
1320
+
1321
+ # Driver / Scheduler Records Examined: copy the parsed per-driver columns into the DOCX arrays
1322
+ if "drivers_detailed" in pdf_extracted and "Driver / Scheduler Records Examined" in merged:
1323
+ drivers = pdf_extracted["drivers_detailed"]
1324
+
1325
+ def _looks_like_range(s):
1326
+ return bool(re.search(r"[0-9]{1,2}[/-]", s or ""))
1327
+
1328
+ merged["Driver / Scheduler Records Examined"]["Roster / Schedule / Safe Driving Plan (Date Range)"] = [d.get("roster_schedule","") for d in drivers]
1329
+ merged["Driver / Scheduler Records Examined"]["Fit for Duty Statement Completed (Yes/No)"] = [d.get("fit_for_duty","") for d in drivers]
1330
+ merged["Driver / Scheduler Records Examined"]["Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)"] = [d.get("work_diary","") for d in drivers]
1331
+
1332
+
1333
+ # --- Print accreditation name (robust, no UnboundLocalError) ---
1334
+ if "Print accreditation name" in merged:
1335
+ acc_name = "" # init
1336
+ acc_name = _smart_space(pdf_extracted.get("print_accreditation_name") or "")
1337
+ if not acc_name:
1338
+ oi = pdf_extracted.get("operator_info") or {}
1339
+ acc_name = _smart_space(oi.get("name") or "") or _smart_space(oi.get("trading_name") or "")
1340
+ if acc_name:
1341
+ merged["Print accreditation name"]["(print accreditation name)"] = [acc_name]
1342
+
1343
+ # Audit Declaration dates: prefer explicit extracted date; fallback to audit_info; ignore literal "Date"
1344
+ if "Audit Declaration dates" in merged:
1345
+ def _real_date(s: Optional[str]) -> bool:
1346
+ return bool(s and re.search(r"\d", s) and not re.fullmatch(r"date", s.strip(), re.I))
1347
+
1348
+ val = pdf_extracted.get("audit_conducted_date")
1349
+ if not _real_date(val):
1350
+ val = (pdf_extracted.get("audit_info", {}) or {}).get("date_of_audit")
1351
+
1352
+ if _real_date(val):
1353
+ merged["Audit Declaration dates"]["Audit was conducted on"] = [_smart_space(val)]
1354
+
1355
+
1356
+ # Operator Declaration: page 22 image missing → derive from first Attendance "Name - Title"
1357
+ if "Operator Declaration" in merged:
1358
+ # If an explicit operator declaration exists, use it
1359
+ if "operator_declaration" in pdf_extracted:
1360
+ od = pdf_extracted["operator_declaration"]
1361
+ pn = _smart_space(od.get("print_name", ""))
1362
+ pt = _smart_space(od.get("position_title", ""))
1363
+ if pn:
1364
+ merged["Operator Declaration"]["Print Name"] = [pn]
1365
+ if pt:
1366
+ merged["Operator Declaration"]["Position Title"] = [pt]
1367
+ else:
1368
+ # Fallback: first "Name - Title" from Attendance
1369
+ nt = self._first_attendance_name_title(pdf_extracted.get("attendance", []))
1370
+ if nt:
1371
+ merged["Operator Declaration"]["Print Name"] = [nt[0]]
1372
+ merged["Operator Declaration"]["Position Title"] = [nt[1]]
1373
+
1374
+
1375
+ # Paragraphs: fill company name for the 3 management headings; set the 2 dates
1376
+ if "paragraphs" in merged:
1377
+ paras = merged["paragraphs"]
1378
+
1379
+ audit_date = (
1380
+ pdf_extracted.get("audit_conducted_date")
1381
+ or pdf_extracted.get("audit_info", {}).get("date_of_audit")
1382
+ )
1383
+
1384
+ # Prefer accreditation name, else operator legal name, else trading name
1385
+ company_name = (
1386
+ _smart_space(pdf_extracted.get("print_accreditation_name") or "")
1387
+ or _smart_space(pdf_extracted.get("operator_info", {}).get("name") or "")
1388
+ or _smart_space(pdf_extracted.get("operator_info", {}).get("trading_name") or "")
1389
+ )
1390
+
1391
+ # Update the three layered headings
1392
+ for key in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
1393
+ if key in paras and company_name:
1394
+ paras[key] = [company_name]
1395
+
1396
+ # Second-last page: date under page heading
1397
+ if "NHVAS APPROVED AUDITOR DECLARATION" in paras and audit_date:
1398
+ paras["NHVAS APPROVED AUDITOR DECLARATION"] = [_smart_space(audit_date)]
1399
+
1400
+ # Last page: date under long acknowledgement paragraph
1401
+ ack_key = ("I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
1402
+ "I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.")
1403
+ if ack_key in paras and audit_date:
1404
+ paras[ack_key] = [_smart_space(audit_date)]
1405
+
1406
+ self._force_fill_maintenance_from_tables(pdf_data, merged)
1407
+ return merged
1408
+
1409
+ # ───────────────────────────── merge & CLI (unchanged) ─────────────────────────────
1410
+ def merge_pdf_to_docx(self, docx_data: Dict, pdf_data: Dict) -> Dict:
1411
+ self.log_debug("Starting comprehensive PDF extraction...")
1412
+ pdf_extracted = self.extract_from_pdf_comprehensive(pdf_data)
1413
+ self.log_debug(f"Extracted PDF data keys: {list(pdf_extracted.keys())}")
1414
+
1415
+ self.log_debug("Mapping to DOCX structure...")
1416
+ merged_data = self.map_to_docx_structure(pdf_extracted, docx_data, pdf_data)
1417
+
1418
+ for section_name, section_data in docx_data.items():
1419
+ if isinstance(section_data, dict):
1420
+ for label in section_data:
1421
+ if (section_name in merged_data and
1422
+ label in merged_data[section_name] and
1423
+ merged_data[section_name][label] != docx_data[section_name][label]):
1424
+ print(f"✓ Updated {section_name}.{label}: {merged_data[section_name][label]}")
1425
+ return merged_data
1426
+
1427
+ def process_files(self, docx_file: str, pdf_file: str, output_file: str):
1428
+ try:
1429
+ print(f"Loading DOCX JSON from: {docx_file}")
1430
+ with open(docx_file, 'r', encoding='utf-8') as f:
1431
+ docx_data = json.load(f)
1432
+ print(f"Loading PDF JSON from: {pdf_file}")
1433
+ with open(pdf_file, 'r', encoding='utf-8') as f:
1434
+ pdf_data = json.load(f)
1435
+
1436
+ print("Merging PDF data into DOCX structure...")
1437
+ merged_data = self.merge_pdf_to_docx(docx_data, pdf_data)
1438
+
1439
+ print(f"Saving merged data to: {output_file}")
1440
+ with open(output_file, 'w', encoding='utf-8') as f:
1441
+ json.dump(merged_data, f, indent=2, ensure_ascii=False)
1442
+
1443
+ print("✅ Merge completed successfully!")
1444
+ return merged_data
1445
+ except Exception as e:
1446
+ print(f"❌ Error processing files: {str(e)}")
1447
+ import traceback
1448
+ traceback.print_exc()
1449
+ raise
1450
+
1451
+ def main():
1452
+ if len(sys.argv) != 4:
1453
+ print("Usage: python nhvas_merger.py <docx_json_file> <pdf_json_file> <output_file>")
1454
+ print("Example: python nhvas_merger.py docx_template.json pdf_extracted.json merged_output.json")
1455
+ sys.exit(1)
1456
+
1457
+ docx_file = sys.argv[1]
1458
+ pdf_file = sys.argv[2]
1459
+ output_file = sys.argv[3]
1460
+
1461
+ for file_path in [docx_file, pdf_file]:
1462
+ if not Path(file_path).exists():
1463
+ print(f"❌ File not found: {file_path}")
1464
+ sys.exit(1)
1465
+
1466
+ merger = NHVASMerger()
1467
+ merger.process_files(docx_file, pdf_file, output_file)
1468
+
1469
+ if __name__ == "__main__":
1470
+ main()
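
For reference, a minimal sketch of driving the merge step above from Python instead of the CLI; the JSON file names are hypothetical examples and it assumes the module is importable as update_docx_with_pdf:

    from update_docx_with_pdf import NHVASMerger

    merger = NHVASMerger()
    merged = merger.process_files(
        "docx_template.json",    # red-placeholder structure extracted from the DOCX
        "pdf_extracted.json",    # data pulled from the audit PDF
        "merged_output.json",    # result later consumed by updated_word.py
    )
    print(sorted(merged.keys()))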
space-pdf/updated_word.py ADDED
@@ -0,0 +1,1189 @@
1
+ #!/usr/bin/env python3
2
+ # update_docx_from_json.py
3
+ import sys, json, re
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple, Optional
6
+ from docx import Document
7
+ from docx.shared import RGBColor, Pt # add Pt
8
+ from docx.table import _Cell, Table
9
+ from docx.text.paragraph import Paragraph
10
+ from copy import deepcopy
11
+ from docx.oxml.ns import qn
12
+ from docx.oxml.table import CT_Tbl
13
+ from docx.oxml.text.paragraph import CT_P
14
+
15
+ BLACK = RGBColor(0, 0, 0)
16
+ RED = RGBColor(0xFF, 0x00, 0x00)
17
+
18
+ # ----------------------------- text helpers -----------------------------
19
+ def _find_table_with_headers(doc: Document, must_have: list[str]) -> Optional[Table]:
20
+ for t in doc.tables:
21
+ if not t.rows:
22
+ continue
23
+ head = canon(" ".join(cell_text(c) for c in t.rows[0].cells))
24
+ if all(canon_label(x) in head for x in must_have):
25
+ return t
26
+ return None
27
+
28
+ def ensure_auditor_decl_headers(doc: Document) -> bool:
29
+ """
30
+ Second-last page table under 'NHVAS APPROVED AUDITOR DECLARATION'.
31
+ Force the HEADER row to read exactly:
32
+ [ Print Name | NHVR or Exemplar Global Auditor Registration Number ]
33
+ Never touch the bottom (values) row.
34
+ """
35
+ changed = False
36
+ expected_left = "Print Name"
37
+ expected_right = "NHVR or Exemplar Global Auditor Registration Number"
38
+
39
+ for t in doc.tables:
40
+ if not t.rows or not t.rows[0].cells:
41
+ continue
42
+ # must look like the auditor table: header left says "Print Name", 2+ cols, 2+ rows
43
+ head_left = canon_label(cell_text(t.rows[0].cells[0]))
44
+ if head_left == "print name" and len(t.rows[0].cells) >= 2 and len(t.rows) >= 2:
45
+ # fix left header if needed
46
+ if canon_label(cell_text(t.rows[0].cells[0])) != canon_label(expected_left) or \
47
+ any(is_red_run(r) for p in t.rows[0].cells[0].paragraphs for r in p.runs):
48
+ _set_cell_text_black(t.rows[0].cells[0], expected_left)
49
+ changed = True
50
+ # reset the RIGHT header text when it has been overwritten or is still red (this is where "Peter Sheppard" was sitting)
51
+ if canon_label(cell_text(t.rows[0].cells[1])) != canon_label(expected_right) or \
52
+ any(is_red_run(r) for p in t.rows[0].cells[1].paragraphs for r in p.runs):
53
+ _set_cell_text_black(t.rows[0].cells[1], expected_right)
54
+ changed = True
55
+ # found and fixed the table; no need to continue
56
+ break
57
+
58
+ return changed
59
+
60
+
61
+ def fill_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
62
+ """Last page table: write values ONLY into the bottom row (red placeholders)."""
63
+ t = _find_table_with_headers(doc, ["Print Name", "Position Title"])
64
+ if not t or len(t.rows) < 2 or len(t.rows[0].cells) < 2:
65
+ return False
66
+ bot_left = t.rows[1].cells[0]
67
+ bot_right = t.rows[1].cells[1]
68
+
69
+ # only replace if that cell has a red placeholder
70
+ if any(is_red_run(r) for p in bot_left.paragraphs for r in p.runs):
71
+ _set_cell_text_black(bot_left, print_name)
72
+ if any(is_red_run(r) for p in bot_right.paragraphs for r in p.runs):
73
+ _set_cell_text_black(bot_right, position_title)
74
+ return True
75
+
76
+ def find_heading_index_from_end(doc: Document, heading: str) -> Optional[int]:
77
+ key = canon(heading)
78
+ allp = iter_paragraphs(doc)
79
+ for i in range(len(allp) - 1, -1, -1):
80
+ if key in canon(para_text(allp[i])):
81
+ return i
82
+ return None
83
+
84
+ def set_date_by_heading_from_end(doc: Document, heading: str, date_text: str, max_scan: int = 60) -> bool:
85
+ """Find the LAST occurrence of `heading`, then replace the FIRST red run in the next paragraphs."""
86
+ if not date_text:
87
+ return False
88
+ allp = iter_paragraphs(doc)
89
+ idx = find_heading_index_from_end(doc, heading)
90
+ if idx is None:
91
+ return False
92
+ for p in allp[idx + 1 : min(idx + 1 + max_scan, len(allp))]:
93
+ if replace_red_in_paragraph(p, date_text): # writes in black
94
+ return True
95
+ return False
96
+
97
+ def set_date_by_paragraph_from_end(doc: Document, paragraph_text: str, date_text: str, max_scan: int = 60) -> bool:
98
+ """Find the LAST paragraph matching `paragraph_text`, then set the FIRST red run after it."""
99
+ if not date_text:
100
+ return False
101
+ key = canon(paragraph_text)
102
+ allp = iter_paragraphs(doc)
103
+ hit = None
104
+ for i in range(len(allp) - 1, -1, -1):
105
+ if key in canon(para_text(allp[i])):
106
+ hit = i
107
+ break
108
+ if hit is None:
109
+ return False
110
+ # date placeholder is on the LAST page, right after this long paragraph
111
+ for p in allp[hit + 1 : min(hit + 1 + max_scan, len(allp))]:
112
+ if replace_red_in_paragraph(p, date_text): # writes in black
113
+ return True
114
+ return False
115
+
116
+ def set_layer3_name_after_management_heading(doc: Document, mid_heading: str, allowed_prev_titles: List[str], name: str) -> bool:
117
+ if not name:
118
+ return False
119
+
120
+ allp = iter_paragraphs(doc)
121
+ wrote = False
122
+ mid = canon(mid_heading)
123
+ allowed_prev = {canon(t) for t in allowed_prev_titles}
124
+
125
+ for i, p in enumerate(allp):
126
+ if canon(para_text(p)) != mid:
127
+ continue
128
+
129
+ # previous non-empty must be one of the allowed titles
130
+ j = i - 1
131
+ while j >= 0 and not nz(para_text(allp[j])):
132
+ j -= 1
133
+ if j < 0 or canon(para_text(allp[j])) not in allowed_prev:
134
+ continue
135
+
136
+ # next non-empty is the 3rd line we overwrite
137
+ k = i + 1
138
+ while k < len(allp) and not nz(para_text(allp[k])):
139
+ k += 1
140
+ if k >= len(allp):
141
+ continue
142
+
143
+ # compute target size from the middle heading; fall back to a sensible bump
144
+ target_size = _para_effective_font_size(allp[i]) or Pt(16)
145
+
146
+ _clear_para_and_write_black(allp[k], name)
147
+
148
+ # apply size to all runs explicitly (overrides style)
149
+ for r in allp[k].runs:
150
+ r.font.size = target_size
151
+
152
+ wrote = True
153
+
154
+ return wrote
155
+
156
+ def _para_effective_font_size(p: Paragraph):
157
+ # try explicit run sizes first
158
+ for r in p.runs:
159
+ if r.font.size:
160
+ return r.font.size
161
+ # then the paragraph style
162
+ if p.style and p.style.font and p.style.font.size:
163
+ return p.style.font.size
164
+ return None
165
+
166
+ # --- helpers for summary tables & summary overwrite ---
168
+ def _std_key(s: str) -> str:
169
+ """
170
+ Normalize a label to match a 'Std N' key.
171
+ e.g. 'Std 7. Internal Review' -> 'std 7'
172
+ """
173
+ t = canon_label(s)
174
+ m = re.match(r"(std\s+\d+)", t)
175
+ return m.group(1) if m else t
176
+
177
+ def _looks_like_summary_table(table: Table) -> Optional[Tuple[int, int]]:
178
+ """
179
+ Return (label_col_idx, details_col_idx) if this is a Summary table
180
+ with a DETAILS column; otherwise None.
181
+ """
182
+ if not table.rows:
183
+ return None
184
+ first = table.rows[0]
185
+ cols = len(first.cells)
186
+ if cols < 2:
187
+ return None
188
+
189
+ # header texts for first row
190
+ head = [canon(cell_text(c)) for c in first.cells]
191
+
192
+ # find DETAILS column
193
+ details_col = None
194
+ for j, t in enumerate(head):
195
+ if "detail" in t:
196
+ details_col = j
197
+ break
198
+ if details_col is None:
199
+ return None
200
+
201
+ # find the label column (left-hand standards column)
202
+ label_col = None
203
+ for j, t in enumerate(head):
204
+ if any(k in t for k in ["maintenance management", "mass management", "fatigue management"]):
205
+ label_col = j
206
+ break
207
+ if label_col is None:
208
+ # fallback: assume the first non-DETAILS column is the label column
209
+ label_col = 0 if details_col != 0 else 1
210
+
211
+ return (label_col, details_col)
219
+ def _header_col_texts(table: Table, scan_rows: int = 5) -> List[str]:
220
+ scan_rows = min(scan_rows, len(table.rows))
221
+ if scan_rows == 0:
222
+ return []
223
+ # pick the row with the most cells as base
224
+ base_row = max(range(scan_rows), key=lambda i: len(table.rows[i].cells))
225
+ base_cols = len(table.rows[base_row].cells)
226
+ cols = []
227
+ for j in range(base_cols):
228
+ parts = []
229
+ for i in range(scan_rows):
230
+ row = table.rows[i]
231
+ if j < len(row.cells):
232
+ parts.append(cell_text(row.cells[j]))
233
+ cols.append(canon(" ".join(parts)))
234
+ return cols
235
+
236
+ def count_header_rows(table: Table, scan_up_to: int = 6) -> int:
237
+ """Header ends right before the first row whose 1st cell looks like '1.'"""
238
+ limit = min(scan_up_to, len(table.rows))
239
+ for i in range(limit):
240
+ first = cell_text(table.rows[i].cells[0]).strip()
241
+ if re.match(r"^\d+\.?$", first):
242
+ return i
243
+ # fallback to 1 header row
244
+ return 1
245
+
246
+ def map_cols_mass_strict(table: Table) -> Dict[str, int]:
247
+ cols = _header_col_texts(table, 5)
248
+ def first_col(*needles):
249
+ for j, t in enumerate(cols):
250
+ if all(n in t for n in needles):
251
+ return j
252
+ return None
253
+ idx = {
254
+ "no": first_col("no"),
255
+ "reg": first_col("registration", "number") or first_col("registration"),
256
+ "wv": first_col("weight", "verification"),
257
+ "rfs": first_col("rfs", "cert") or first_col("rfs", "certification"),
258
+ "susp": first_col("suspension", "maintenance"),
259
+ "trip": first_col("trip", "record"),
260
+ "frs": first_col("fault", "suspension") or first_col("fault", "reporting", "suspension"),
261
+ }
262
+ return {k: v for k, v in idx.items() if v is not None}
263
+
264
+ def find_mass_vehicle_numbers_table(doc: Document) -> Optional[Table]:
265
+ """Pick the Mass vehicle-number table by matching its column set (not the Summary table)."""
266
+ best = None
267
+ best_score = -1
268
+ for t in iter_tables(doc):
269
+ cols = _header_col_texts(t, 5)
270
+ allhdr = " ".join(cols)
271
+ # must look like the vehicle numbers table
272
+ hits = 0
273
+ hits += int(any("registration" in c and "number" in c for c in cols))
274
+ hits += int(any("weight" in c and "verification" in c for c in cols))
275
+ hits += int(any("rfs" in c and ("cert" in c or "certification" in c) for c in cols))
276
+ hits += int(any("suspension" in c and "maintenance" in c for c in cols))
277
+ hits += int(any("trip" in c and "record" in c for c in cols))
278
+ hits += int(any("fault" in c and "suspension" in c for c in cols))
279
+ # reject obvious Summary tables
280
+ if "details" in allhdr:
281
+ continue
282
+ # prefer tables with numbering column and many rows
283
+ score = hits + (0.5 if any("no" == c or c.startswith("no ") for c in cols) else 0) + (len(t.rows) / 100.0)
284
+ if hits >= 4 and score > best_score:
285
+ best, best_score = t, score
286
+ return best
287
+
288
+ def update_operator_declaration(doc: Document, print_name: str, position_title: str) -> bool:
289
+ """
290
+ First try strict table label mapping for 'Print Name' and 'Position Title'.
291
+ If not found, fallback to the first two red placeholders under the 'Operator Declaration' heading.
292
+ """
293
+ changed = False
294
+ # 1) Table label approach
295
+ for lbl, val in (("Print Name", print_name), ("Position Title", position_title)):
296
+ if not val:
297
+ continue
298
+ loc = find_label_cell(doc, lbl)
299
+ if not loc:
300
+ # tolerate odd spacing/colon/camelcase
301
+ for alt in ("PrintName", "Print Name", "Print Name:", "PositionTitle", "Position Title", "Position Title:"):
302
+ loc = find_label_cell(doc, alt)
303
+ if loc:
304
+ break
305
+ if loc:
306
+ t, r, c = loc
307
+ cell = get_adjacent_value_cell(t, r, c)
308
+ if not replace_red_in_cell(cell, val):
309
+ _set_cell_text_black(cell, val)
310
+ changed = True
311
+
312
+ if changed:
313
+ return True
314
+
315
+ # 2) Fallback: heading-scoped red placeholders
316
+ head = "OPERATOR DECLARATION"
317
+ p = find_heading_paragraph(doc, head) or find_heading_paragraph(doc, head.title())
318
+ if not p:
319
+ return False
320
+ allp = iter_paragraphs(doc)
321
+ try:
322
+ i = allp.index(p)
323
+ except ValueError:
324
+ i = 0
325
+ red_targets = []
326
+ for q in allp[i+1:i+1+20]:
327
+ reds = [r for r in q.runs if is_red_run(r)]
328
+ if reds:
329
+ red_targets.extend(reds)
330
+ if len(red_targets) >= 2:
331
+ break
332
+ wrote = False
333
+ if print_name and red_targets:
334
+ _set_text_and_black(red_targets[0], print_name); wrote = True
335
+ if position_title and len(red_targets) >= 2:
336
+ _set_text_and_black(red_targets[1], position_title); wrote = True
337
+ return wrote
338
+
339
+
340
+ def fill_mass_vehicle_table_preserve_headers(table: Table, arrays: Dict[str, List[str]]):
341
+ colmap = map_cols_mass_strict(table)
342
+ if "reg" not in colmap:
343
+ return
344
+ hdr_rows = count_header_rows(table, 6)
345
+ regs = arrays.get("Registration Number", [])
346
+ n = len(regs)
347
+
348
+ # clear data rows only
349
+ while len(table.rows) > hdr_rows:
350
+ table._tbl.remove(table.rows[-1]._tr)
351
+ # ensure enough rows
352
+ while len(table.rows) < hdr_rows + n:
353
+ table.add_row()
354
+
355
+ def put(row, key, arr_key, i):
356
+ if key in colmap:
357
+ vals = arrays.get(arr_key, [])
358
+ val = nz(vals[i]) if i < len(vals) else ""
359
+ replace_red_in_cell(row.cells[colmap[key]], val)
360
+
361
+ for i in range(n):
362
+ row = table.rows[hdr_rows + i]
363
+ replace_red_in_cell(row.cells[colmap["reg"]], nz(regs[i]))
364
+ put(row, "wv", "Weight Verification Records", i)
365
+ put(row, "rfs", "RFS Suspension Certification #", i)
366
+ put(row, "susp", "Suspension System Maintenance", i)
367
+ put(row, "trip", "Trip Records", i)
368
+ put(row, "frs", "Fault Recording/ Reporting on Suspension System", i)
369
+
370
+ def overwrite_summary_details_cells(doc: Document, section_name: str, section_dict: Dict[str, List[str]]) -> int:
371
+ """For a Summary table (Maintenance/Mass/Fatigue), replace the entire DETAILS cell
372
+ for each Std N row with the JSON text (written in black)."""
373
+ # build desired texts
374
+ desired: Dict[str, str] = { _std_key(k): join_value(v) for k, v in section_dict.items() }
375
+
376
+ # pick which tables belong to this section by header sniff
377
+ wanted_prefix = canon_label(section_name.split()[0]) # "maintenance" | "mass" | "fatigue"
378
+
379
+ updated = 0
380
+ for t in doc.tables:
381
+ cols = _looks_like_summary_table(t)
382
+ if not cols:
383
+ continue
384
+ label_col, details_col = cols
385
+
386
+ head_txt = table_header_text(t, up_to_rows=2)
387
+ if wanted_prefix not in head_txt: # keep to the correct section
388
+ continue
389
+
390
+ # walk body rows
391
+ for i in range(1, len(t.rows)):
392
+ row = t.rows[i]
393
+ key = _std_key(cell_text(row.cells[label_col]))
394
+
395
+ # exact match or "std N" prefix match
396
+ cand = desired.get(key)
397
+ if not cand:
398
+ m = re.match(r"(std\s+\d+)", key)
399
+ if m:
400
+ for k2, v2 in desired.items():
401
+ if k2.startswith(m.group(1)):
402
+ cand = v2
403
+ break
404
+ if not cand:
405
+ continue
406
+
407
+ _set_cell_text_black(row.cells[details_col], cand) # full overwrite, black
408
+ updated += 1
409
+ return updated
410
+
411
+ SPLIT_SENT_PAT = re.compile(r"(?<=\.|\?|!)\s+")
412
+ ORDINAL_DATE_PAT = re.compile(r"\b(\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4})\b", re.I)
413
+
414
+ def split_sentences_keep(text: str) -> List[str]:
415
+ s = " ".join(str(text or "").split())
416
+ if not s:
417
+ return []
418
+ out = []
419
+ start = 0
420
+ for m in SPLIT_SENT_PAT.finditer(s):
421
+ out.append(s[start:m.start()].strip())
422
+ start = m.end()
423
+ last = s[start:].strip()
424
+ if last:
425
+ out.append(last)
426
+ return out
427
+
428
+ _sent_split = re.compile(r'(?<=[.!?])\s+|\n+')
429
+ _date_pat = re.compile(r'\b(?:\d{1,2}(?:st|nd|rd|th)\s+[A-Za-z]+\s+\d{4}|\d{1,2}/\d{1,2}/\d{2,4}|[A-Za-z]+\s+\d{1,2},\s*\d{4})\b')
430
+
431
+ def extract_summary_snippets(desired_text: str):
432
+ sents = split_sentences_keep(desired_text)
433
+ dates = [m.group(0) for m in _date_pat.finditer(desired_text)]
434
+ pick = lambda rx: next((s for s in sents if re.search(rx, s, re.I)), None)
435
+ return {
436
+ "sheet_sent": pick(r'\b(daily\s+check|sheet)\b'),
437
+ "sheet_phrase": _extract_sheet_phrase_from_desired(desired_text),
438
+ "review": pick(r'\binternal\s+review\b'),
439
+ "qcs": pick(r'\bquarterly\b.*\bcompliance\b') or pick(r'\bquarterly\b'),
440
+ "dates": dates,
441
+ "sents": sents,
442
+ }
443
+
444
+ def fill_management_summary_tables(doc: Document, section_key: str, section_data: Dict[str, List[str]]):
445
+ """
446
+ Fill ALL summary tables for the given section_key ('maintenance'|'mass'|'fatigue')
447
+ by matching each row label (left column) against keys in section_data and
448
+ patching only the red text inside the DETAILS cell.
449
+ """
450
+ targets = [x for x in find_all_summary_tables(doc) if x[0] == section_key]
451
+ if not targets:
452
+ return
453
+
454
+ # build list of (normalized label, original label, desired_text)
455
+ desired = []
456
+ for label, vals in section_data.items():
457
+ want = canon_label(label)
458
+ if not want:
459
+ continue
460
+ desired.append((want, label, join_value(vals)))
461
+
462
+ for _, table, lcol, dcol in targets:
463
+ # iterate data rows (skip header)
464
+ for i in range(1, len(table.rows)):
465
+ left_txt_norm = canon_label(cell_text(table.rows[i].cells[lcol]))
466
+ if not left_txt_norm:
467
+ continue
468
+ for want_norm, _orig_lbl, value in desired:
469
+ # loose contains match handles minor punctuation differences
470
+ if want_norm and want_norm in left_txt_norm:
471
+ patch_details_cell_from_json(table.rows[i].cells[dcol], value)
472
+
473
+ def _set_text_and_black(run, new_text: str):
474
+ """Replace a run's text and force color to black (clears theme color too)."""
475
+ if new_text is None:
476
+ new_text = ""
477
+ run.text = str(new_text)
478
+ run.font.color.rgb = BLACK
479
+ try:
480
+ # clear any theme color so rgb sticks
481
+ run.font.color.theme_color = None
482
+ except Exception:
483
+ pass
484
+
485
+ def update_business_summary_once(doc: Document, value) -> bool:
486
+ """Replace only the red summary paragraph; keep 'Accreditation Number' and 'Expiry Date' lines."""
487
+ loc = (find_label_cell(doc, "Nature of the Operators Business (Summary)")
488
+ or find_label_cell(doc, "Nature of the Operators Business (Summary):"))
489
+ if not loc:
490
+ return False
491
+
492
+ t, r, c = loc
493
+ cell = get_adjacent_value_cell(t, r, c)
494
+ if not cell.paragraphs:
495
+ cell.add_paragraph("")
496
+
497
+ txt = join_value(value)
498
+
499
+ # find paragraphs with any red runs (the placeholders for the summary)
500
+ red_paras = [p for p in cell.paragraphs if any(is_red_run(run) for run in p.runs)]
501
+
502
+ if red_paras:
503
+ # write the summary into the first red paragraph (in black)
504
+ _clear_para_and_write_black(red_paras[0], txt)
505
+ # clear any extra red placeholders
506
+ for p in red_paras[1:]:
507
+ _clear_para_and_write_black(p, "")
508
+ else:
509
+ # no red placeholder found: just put the summary into the first paragraph, leave others
510
+ _clear_para_and_write_black(cell.paragraphs[0], txt)
511
+
512
+ return True
513
+
514
+
515
+ def _nuke_cell_paragraphs(cell: _Cell):
516
+ """Remove ALL paragraphs from a cell (true delete, not just emptying runs)."""
517
+ for p in list(cell.paragraphs):
518
+ p._element.getparent().remove(p._element)
519
+
520
+ def _clear_para_and_write_black(paragraph, text: str):
521
+ """Clear a whole paragraph and write fresh black text."""
522
+ # wipe existing runs
523
+ for r in list(paragraph.runs):
524
+ r.text = ""
525
+ r = paragraph.add_run(str(text or ""))
526
+ r.font.color.rgb = BLACK
527
+ try:
528
+ r.font.color.theme_color = None
529
+ except Exception:
530
+ pass
531
+
532
+ def _set_cell_text_black(cell, text: str):
533
+ """Clear a table cell and insert black text."""
534
+ # remove text from all runs in all paragraphs
535
+ for p in cell.paragraphs:
536
+ for r in p.runs:
537
+ r.text = ""
538
+ p = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
539
+ r = p.add_run(str(text or ""))
540
+ r.font.color.rgb = BLACK
541
+ try:
542
+ r.font.color.theme_color = None
543
+ except Exception:
544
+ pass
545
+
546
+ def nz(x: Optional[str]) -> str:
547
+ return (x or "").strip()
548
+
549
+ def canon(s: str) -> str:
550
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
551
+ s = s.replace("–", "-").replace("—", "-")
552
+ return re.sub(r"[^a-z0-9/#()+,.\- ]+", "", s)
553
+
554
+ def canon_label(s: str) -> str:
555
+ # labels often vary by punctuation/casing; keep digits/letters
556
+ s = re.sub(r"\s+", " ", str(s)).strip().lower()
557
+ s = s.replace("–", "-").replace("—", "-")
558
+ s = re.sub(r"[^a-z0-9 ]+", " ", s)
559
+ return re.sub(r"\s+", " ", s).strip()
560
+
561
+ def join_value(value) -> str:
562
+ if isinstance(value, list):
563
+ # Keep multi-line when list provided
564
+ return "\n".join([str(v) for v in value if nz(v)])
565
+ return str(value)
566
+
567
+ def split_digits(s: str) -> List[str]:
568
+ return re.findall(r"\d", s)
569
+
570
+ def para_text(p: Paragraph) -> str:
571
+ return "".join(run.text for run in p.runs)
572
+
573
+ def cell_text(c: _Cell) -> str:
574
+ return "\n".join(para_text(p) for p in c.paragraphs)
575
+
576
+ def is_red_run(run) -> bool:
577
+ col = run.font.color
578
+ if not col:
579
+ return False
580
+ if col.rgb is not None:
581
+ return col.rgb == RED
582
+ # Some templates use theme colors; treat explicit red text snippets only
583
+ return False
584
+
585
+ def replace_red_in_paragraph(p: Paragraph, new_text: str) -> bool:
586
+ replaced = False
587
+ red_runs = [r for r in p.runs if is_red_run(r)]
588
+ if not red_runs:
589
+ return False
590
+ # collapse all red runs into one and write value (in black)
591
+ first = red_runs[0]
592
+ _set_text_and_black(first, new_text)
593
+ for r in red_runs[1:]:
594
+ r.text = ""
595
+ replaced = True
596
+ return replaced
597
+
598
+ def replace_red_in_cell(cell: _Cell, new_text: str) -> bool:
599
+ # replace only red runs; if none, replace whole cell with a single run (fallback)
600
+ any_red = False
601
+ for p in cell.paragraphs:
602
+ if replace_red_in_paragraph(p, new_text):
603
+ any_red = True
604
+ if any_red:
605
+ return True
606
+ # fallback: clear cell, set single paragraph text in black
607
+ _set_cell_text_black(cell, new_text)
608
+ return True
609
+
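
A minimal usage sketch of the red-run helpers above, assuming a template whose placeholders are red runs; the file name is a hypothetical example:

    from docx import Document

    doc = Document("audit_template.docx")              # hypothetical template path
    for p in doc.paragraphs:
        if replace_red_in_paragraph(p, "ACME Transport Pty Ltd"):
            break                                      # first red placeholder rewritten in black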
610
+ def parse_attendance_lines(value) -> List[str]:
611
+ """
612
+ Parse strings like:
613
+ "Peter Sheppard - Compliance Greg Dyer - Auditor"
614
+ into:
615
+ ["Peter Sheppard - Compliance", "Greg Dyer - Auditor"]
616
+ Handles lists, newlines, semicolons, and pipes too.
617
+ """
618
+ if isinstance(value, list):
619
+ s = " ".join(str(v) for v in value if v)
620
+ else:
621
+ s = str(value or "")
622
+ s = re.sub(r"\s+", " ", s).strip()
623
+ if not s:
624
+ return []
625
+
626
+ # First split on explicit separators; then within each chunk, extract Name - Title pairs.
627
+ chunks = re.split(r"\s*[\n;|]\s*", s)
628
+ items: List[str] = []
629
+
630
+ pair_pat = re.compile(
631
+ r"([A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3})\s*-\s*"
632
+ r"([^-\n]+?)(?=\s+[A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){0,3}\s*-\s*|$)"
633
+ )
634
+
635
+ for chunk in chunks:
636
+ chunk = chunk.strip()
637
+ if not chunk:
638
+ continue
639
+ found = False
640
+ for m in pair_pat.finditer(chunk):
641
+ name = m.group(1).strip()
642
+ title = m.group(2).strip()
643
+ items.append(f"{name} - {title}")
644
+ found = True
645
+ if not found:
646
+ # Fallback: single "Name - Title"
647
+ if " - " in chunk:
648
+ a, b = chunk.split(" - ", 1)
649
+ items.append(f"{a.strip()} - {b.strip()}")
650
+ elif chunk:
651
+ items.append(chunk)
652
+
653
+ return items
654
+
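
A quick check of the pair parser above on the docstring's own example (illustrative only):

    assert parse_attendance_lines("Peter Sheppard - Compliance Greg Dyer - Auditor") == [
        "Peter Sheppard - Compliance",
        "Greg Dyer - Auditor",
    ]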
655
+ def fill_attendance_block(doc: Document, value) -> bool:
656
+ items = parse_attendance_lines(value)
657
+ if not items:
658
+ return False
659
+
660
+ loc = find_label_cell(doc, "Attendance List (Names and Position Titles)")
661
+ if not loc:
662
+ return False
663
+
664
+ t, r, c = loc
665
+ # value cell: usually directly under the heading cell
666
+ target = (
667
+ t.rows[r + 1].cells[c]
668
+ if r + 1 < len(t.rows) and c < len(t.rows[r + 1].cells)
669
+ else get_adjacent_value_cell(t, r, c)
670
+ )
671
+
672
+ # ---- read ONLY the target cell (don’t touch the row)
673
+ def is_red_para(p): return any(is_red_run(run) for run in p.runs)
674
+ def looks_like_pair(s: str) -> bool:
675
+ if " - " not in s: return False
676
+ a, b = s.split(" - ", 1)
677
+ return bool(a.strip()) and bool(b.strip())
678
+
679
+ paras = list(target.paragraphs)
680
+ red_count = sum(1 for p in paras if is_red_para(p))
681
+ existing_black = [para_text(p).strip() for p in paras
682
+ if (not is_red_para(p)) and looks_like_pair(para_text(p))]
683
+
684
+ # compose final lines
685
+ out_lines: List[str] = []
686
+ out_lines.extend(items[:red_count]) # replace red placeholders
687
+ out_lines.extend(existing_black) # keep black lines
688
+ norm = lambda s: re.sub(r"\s+", " ", s.strip().lower())
689
+ seen = {norm(x) for x in out_lines}
690
+ for extra in items[red_count:]:
691
+ k = norm(extra)
692
+ if k not in seen:
693
+ out_lines.append(extra); seen.add(k)
694
+
695
+ # ---- hard clear target cell and write fresh (all black)
696
+ _nuke_cell_paragraphs(target)
697
+ # first line
698
+ p = target.add_paragraph()
699
+ _clear_para_and_write_black(p, out_lines[0] if out_lines else "")
700
+ # remaining lines
701
+ for line in out_lines[1:]:
702
+ p = target.add_paragraph()
703
+ _clear_para_and_write_black(p, line)
704
+
705
+ return True
706
+
707
+ # ----------------------------- document search -----------------------------
708
+ def iter_tables(doc: Document) -> List[Table]:
709
+ return list(doc.tables)
710
+
711
+ def iter_paragraphs(doc: Document) -> List[Paragraph]:
712
+ # paragraphs at doc level + inside tables
713
+ out = list(doc.paragraphs)
714
+ for t in doc.tables:
715
+ for row in t.rows:
716
+ for cell in row.cells:
717
+ out.extend(cell.paragraphs)
718
+ return out
719
+
720
+ def find_heading_paragraph(doc: Document, heading_text: str, window: int = 60) -> Optional[Paragraph]:
721
+ key = canon(heading_text)
722
+ for p in iter_paragraphs(doc):
723
+ if canon(para_text(p)).startswith(key):
724
+ return p
725
+ # fuzzy contains
726
+ for p in iter_paragraphs(doc):
727
+ if key in canon(para_text(p)):
728
+ return p
729
+ return None
730
+
731
+ def find_label_cell_in_table(table: Table, label: str) -> Optional[Tuple[int, int]]:
732
+ target = canon_label(label)
733
+ for r_i, row in enumerate(table.rows):
734
+ for c_i, cell in enumerate(row.cells):
735
+ if canon_label(cell_text(cell)) == target:
736
+ return (r_i, c_i)
737
+ # allow contains (safe-ish)
738
+ for r_i, row in enumerate(table.rows):
739
+ for c_i, cell in enumerate(row.cells):
740
+ if target and target in canon_label(cell_text(cell)):
741
+ return (r_i, c_i)
742
+ return None
743
+
744
+ def find_label_cell(doc: Document, label: str) -> Optional[Tuple[Table, int, int]]:
745
+ for t in iter_tables(doc):
746
+ pos = find_label_cell_in_table(t, label)
747
+ if pos:
748
+ return (t, pos[0], pos[1])
749
+ return None
750
+
751
+ def get_adjacent_value_cell(table: Table, r: int, c: int) -> _Cell:
752
+ # Prefer right cell, otherwise next row same col, otherwise this cell
753
+ cols = len(table.rows[0].cells)
754
+ if c + 1 < cols:
755
+ return table.rows[r].cells[c+1]
756
+ if r + 1 < len(table.rows):
757
+ return table.rows[r+1].cells[c]
758
+ return table.rows[r].cells[c]
759
+
760
+ # ----------------------------- label/value updates -----------------------------
761
+ def update_label_value_in_tables(doc: Document, label: str, value) -> bool:
762
+ tup = find_label_cell(doc, label)
763
+ val = join_value(value)
764
+ if not tup:
765
+ return False
766
+ t, r, c = tup
767
+ target_cell = get_adjacent_value_cell(t, r, c)
768
+ return replace_red_in_cell(target_cell, val)
769
+
770
+ def update_heading_followed_red(doc: Document, heading: str, value, max_scan: int = 12) -> bool:
771
+ """Find heading paragraph, then replace the first red run found within next N paragraphs (including inside tables)"""
772
+ start = find_heading_paragraph(doc, heading)
773
+ if not start:
774
+ return False
775
+ # Build a linear list of paragraphs across whole doc to get an index
776
+ allp = iter_paragraphs(doc)
777
+ try:
778
+ idx = allp.index(start)
779
+ except ValueError:
780
+ idx = 0
781
+ new_text = join_value(value)
782
+ # Scan forward
783
+ for p in allp[idx+1: idx+1+max_scan]:
784
+ if replace_red_in_paragraph(p, new_text):
785
+ return True
786
+ # Also check any red in table cells inside this paragraph's parent (already covered via iter_paragraphs)
787
+ return False
788
+
789
+ # ----------------------------- ACN per-digit fill -----------------------------
790
+ def fill_acn_digits(doc: Document, acn_value: str) -> bool:
791
+ digits = split_digits(acn_value)
792
+ if not digits:
793
+ return False
794
+ loc = find_label_cell(doc, "Australian Company Number")
795
+ if not loc:
796
+ return False
797
+
798
+ t, r, c = loc
799
+
800
+ # Collect cells to the RIGHT in the same row first
801
+ targets: List[_Cell] = [t.rows[r].cells[j] for j in range(c + 1, len(t.rows[r].cells))]
802
+
803
+ # If not enough, continue row-by-row below (left→right)
804
+ rr = r + 1
805
+ while len(targets) < len(digits) and rr < len(t.rows):
806
+ targets.extend(list(t.rows[rr].cells))
807
+ rr += 1
808
+
809
+ targets = targets[:len(digits)]
810
+ if not targets:
811
+ return False
812
+
813
+ # Clear each target cell and write ONE digit in black
814
+ for d, cell in zip(digits, targets):
815
+ _set_cell_text_black(cell, d)
816
+
817
+ return True
818
+
819
+
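
A minimal sketch of the per-digit fan-out above, assuming the template has an 'Australian Company Number' label followed by one cell per digit; the paths are hypothetical:

    from docx import Document

    doc = Document("audit_template.docx")
    fill_acn_digits(doc, "123 456 789")   # writes 1, 2, 3, ... one digit per cell, in black
    doc.save("audit_filled.docx")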
820
+ # ----------------------------- vehicle tables -----------------------------
821
+ def table_header_text(table: Table, up_to_rows: int = 3) -> str:
822
+ heads = []
823
+ for i, row in enumerate(table.rows[:up_to_rows]):
824
+ for cell in row.cells:
825
+ heads.append(cell_text(cell))
826
+ return canon(" ".join(heads))
827
+
828
+ def find_vehicle_table(doc: Document, want: str) -> Optional[Table]:
829
+ """
830
+ want = "maintenance" or "mass"
831
+ """
832
+ MAINT_KEYS = ["registration number", "maintenance records", "daily checks", "fault recording", "fault repair"]
833
+ MASS_KEYS = ["registration number", "weight verification", "rfs suspension", "suspension system maintenance", "trip records", "reporting on suspension"]
834
+ candidates = []
835
+ for t in iter_tables(doc):
836
+ htxt = table_header_text(t)
837
+ if want == "maintenance":
838
+ if all(k in htxt for k in ["registration", "maintenance", "fault"]) and "suspension" not in htxt:
839
+ candidates.append(t)
840
+ elif want == "mass":
841
+ if "suspension" in htxt and "weight" in htxt:
842
+ candidates.append(t)
843
+ # Prefer the one with most rows
844
+ if not candidates:
845
+ return None
846
+ return max(candidates, key=lambda tb: len(tb.rows))
847
+
848
+ def map_cols(table: Table, want: str) -> Dict[str, int]:
849
+ # map header columns by keywords from the first 2 rows that contain headers
850
+ header_rows = table.rows[:2]
851
+ col_texts = []
852
+ cols = len(table.rows[0].cells)
853
+ for j in range(cols):
854
+ txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
855
+ col_texts.append(canon(txt))
856
+ idx = {}
857
+ def first_col(*needles) -> Optional[int]:
858
+ for j, t in enumerate(col_texts):
859
+ if all(n in t for n in needles):
860
+ return j
861
+ return None
862
+ if want == "maintenance":
863
+ idx["reg"] = first_col("registration")
864
+ idx["rw"] = first_col("roadworthiness")
865
+ idx["mr"] = first_col("maintenance", "records")
866
+ idx["daily"] = first_col("daily", "check")
867
+ idx["fr"] = first_col("fault", "recording")
868
+ idx["rep"] = first_col("fault", "repair")
869
+ else:
870
+ idx["reg"] = first_col("registration")
871
+ idx["wv"] = first_col("weight", "verification")
872
+ idx["rfs"] = first_col("rfs", "cert")
873
+ idx["susp"] = first_col("suspension", "maintenance")
874
+ idx["trip"] = first_col("trip", "record")
875
+ idx["frs"] = first_col("fault", "suspension")
876
+ return {k:v for k,v in idx.items() if v is not None}
877
+
878
+ def clear_data_rows_keep_headers(table: Table, header_rows: int = 1):
879
+ # Keep first header_rows, drop everything else
880
+ while len(table.rows) > header_rows:
881
+ table._tbl.remove(table.rows[-1]._tr)
882
+
883
+ def ensure_rows(table: Table, need_rows: int):
884
+ # assumes 1 header row; add rows to reach need_rows + 1 total
885
+ while len(table.rows) < need_rows + 1:
886
+ table.add_row()
887
+
888
+ def fill_vehicle_table(table: Table, want: str, arrays: Dict[str, List[str]]):
889
+ colmap = map_cols(table, want)
890
+ if "reg" not in colmap:
891
+ return
892
+ if want == "maintenance":
893
+ regs = arrays.get("Registration Number", [])
894
+ rw = arrays.get("Roadworthiness Certificates", [])
895
+ mr = arrays.get("Maintenance Records", [])
896
+ daily= arrays.get("Daily Checks", [])
897
+ fr = arrays.get("Fault Recording/ Reporting", [])
898
+ rep = arrays.get("Fault Repair", [])
899
+ n = len(regs)
900
+ # keep header row(s), then fill N rows
901
+ clear_data_rows_keep_headers(table, header_rows=1)
902
+ ensure_rows(table, n)
903
+ for i in range(n):
904
+ row = table.rows[i+1]
905
+ def put(col_key, vals):
906
+ if col_key not in colmap or i >= len(vals): return
907
+ c = row.cells[colmap[col_key]]
908
+ replace_red_in_cell(c, nz(vals[i]))
909
+ # write each col
910
+ c_reg = row.cells[colmap["reg"]]; replace_red_in_cell(c_reg, nz(regs[i]))
911
+ put("rw", rw)
912
+ put("mr", mr)
913
+ put("daily",daily)
914
+ put("fr", fr)
915
+ put("rep", rep)
916
+ else:
917
+ regs = arrays.get("Registration Number", [])
918
+ wv = arrays.get("Weight Verification Records", [])
919
+ rfs = arrays.get("RFS Suspension Certification #", [])
920
+ susp = arrays.get("Suspension System Maintenance", [])
921
+ trip = arrays.get("Trip Records", [])
922
+ frs = arrays.get("Fault Recording/ Reporting on Suspension System", [])
923
+ n = len(regs)
924
+ clear_data_rows_keep_headers(table, header_rows=1)
925
+ ensure_rows(table, n)
926
+ for i in range(n):
927
+ row = table.rows[i+1]
928
+ def put(col_key, vals):
929
+ if col_key not in colmap or i >= len(vals): return
930
+ c = row.cells[colmap[col_key]]
931
+ replace_red_in_cell(c, nz(vals[i]))
932
+ c_reg = row.cells[colmap["reg"]]; replace_red_in_cell(c_reg, nz(regs[i]))
933
+ put("wv", wv)
934
+ put("rfs", rfs)
935
+ put("susp", susp)
936
+ put("trip", trip)
937
+ put("frs", frs)
938
+
939
+ # ----------------------------- driver table -----------------------------
940
+ def find_driver_table(doc: Document) -> Optional[Table]:
941
+ for t in iter_tables(doc):
942
+ h = table_header_text(t)
943
+ if "driver / scheduler" in h and ("fit for duty" in h or "work diary" in h):
944
+ return t
945
+ return None
946
+
947
+ def map_driver_cols(table: Table) -> Dict[str,int]:
948
+ header_rows = table.rows[:2]
949
+ cols = len(table.rows[0].cells)
950
+ col_texts = []
951
+ for j in range(cols):
952
+ txt = " ".join(cell_text(r.cells[j]) for r in header_rows if j < len(r.cells))
953
+ col_texts.append(canon(txt))
954
+ idx = {}
955
+ def first_col(*needles):
956
+ for j, t in enumerate(col_texts):
957
+ if all(n in t for n in needles):
958
+ return j
959
+ return None
960
+ idx["name"] = first_col("driver", "name")
961
+ idx["roster"]= first_col("roster", "safe")
962
+ idx["fit"] = first_col("fit for duty")
963
+ # Work diary might be split across two headers; match "work diary" OR "electronic work diary"
964
+ wd = first_col("work diary") or first_col("electronic work diary")
965
+ if wd is not None: idx["wd"] = wd
966
+ return {k:v for k,v in idx.items() if v is not None}
967
+
968
+ def fill_driver_table(table: Table, arrays: Dict[str, List[str]]):
969
+ colmap = map_driver_cols(table)
970
+ if not colmap:
971
+ return
972
+
973
+ names = arrays.get("Driver / Scheduler Name", [])
974
+ rosters = arrays.get("Roster / Schedule / Safe Driving Plan (Date Range)", [])
975
+ fit = arrays.get("Fit for Duty Statement Completed (Yes/No)", [])
976
+ wd = arrays.get("Work Diary Pages (Page Numbers) Electronic Work Diary Records (Date Range)", [])
977
+
978
+ n = max(len(rosters), len(fit), len(wd), len(names))
979
+ clear_data_rows_keep_headers(table, header_rows=1)
980
+ ensure_rows(table, n)
981
+
982
+ has_any_name = any(str(x).strip() for x in names)
983
+
984
+ for i in range(n):
985
+ row = table.rows[i+1]
986
+ if "name" in colmap and has_any_name:
987
+ replace_red_in_cell(row.cells[colmap["name"]], names[i] if i < len(names) else "")
988
+ if "roster" in colmap:
989
+ replace_red_in_cell(row.cells[colmap["roster"]], rosters[i] if i < len(rosters) else "")
990
+ if "fit" in colmap:
991
+ replace_red_in_cell(row.cells[colmap["fit"]], fit[i] if i < len(fit) else "")
992
+ if "wd" in colmap:
993
+ replace_red_in_cell(row.cells[colmap["wd"]], wd[i] if i < len(wd) else "")
994
+
995
+
996
+
997
+ # ----------------------------- main mapping -----------------------------
998
+ def flatten_simple_sections(data: Dict) -> Dict[str, str]:
999
+ """Collect simple label->single value mappings from top-level sections other than tables."""
1000
+ out = {}
1001
+ skip_sections = {
1002
+ "Vehicle Registration Numbers Maintenance",
1003
+ "Vehicle Registration Numbers Mass",
1004
+ "Driver / Scheduler Records Examined",
1005
+ "paragraphs",
1006
+ "Attendance List (Names and Position Titles)",
1007
+ "Nature of the Operators Business (Summary)",
1008
+ "Maintenance Management Summary",
1009
+ "Mass Management Summary",
1010
+ "Fatigue Management Summary",
1011
+ }
1012
+ for sec, kv in data.items():
1013
+ if sec in skip_sections: continue
1014
+ if not isinstance(kv, dict): continue
1015
+ for label, val in kv.items():
1016
+ out[f"{sec}::{label}"] = join_value(val)
1017
+ return out
1018
+
1019
+ def run(input_json: Path, template_docx: Path, output_docx: Path):
1020
+ with open(input_json, "r", encoding="utf-8") as f:
1021
+ data = json.load(f)
1022
+
1023
+ doc = Document(str(template_docx))
1024
+
1025
+ # 1) simple label/value tables
1026
+ simple = flatten_simple_sections(data)
1027
+
1028
+ # Map by (section::label). We try: (a) find exact label cell somewhere and write in the adjacent cell;
1029
+ # (b) if not found, search by heading then the next red run below the heading.
1030
+ for k, v in simple.items():
1031
+ # use the part after '::' as the label
1032
+ label = k.split("::", 1)[1] if "::" in k else k
1033
+
1034
+ # SPECIAL: skip ACN here; we'll fill per-digit later
1035
+ if canon_label(label) == "australian company number":
1036
+ continue
1037
+
1038
+ ok = update_label_value_in_tables(doc, label, v)
1039
+ if not ok:
1040
+ sec = k.split("::", 1)[0] if "::" in k else k
1041
+ update_heading_followed_red(doc, sec, v)
1042
+
1043
+
1044
+ # 2) paragraphs block
1045
+ paras = data.get("paragraphs", {})
1046
+
1047
+ # 2a) generic headings → replace next red (skip the 3 management headings here)
1048
+ # third-line headings above the three tables
1049
+ for head in ("MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"):
1050
+ name_val = join_value(paras.get(head, ""))
1051
+ if name_val:
1052
+ update_heading_followed_red(doc, head, name_val, max_scan=6)
1053
+
1054
+ # 2b) the 3-layer headings → overwrite the 3rd line only
1055
+ # second-last page: date under page heading
1056
+ aud_head = "NHVAS APPROVED AUDITOR DECLARATION"
1057
+ aud_date = join_value(paras.get(aud_head, ""))
1058
+ if aud_date:
1059
+ set_date_by_heading_from_end(doc, aud_head, aud_date, max_scan=40)
1060
+
1061
+ # last page: date under the long acknowledgement paragraph
1062
+ ack_head = ("I hereby acknowledge and agree with the findings detailed in this NHVAS Audit Summary Report. "
1063
+ "I have read and understand the conditions applicable to the Scheme, including the NHVAS Business Rules and Standards.")
1064
+ ack_date = join_value(paras.get(ack_head, ""))
1065
+ if ack_date:
1066
+ set_date_by_paragraph_from_end(doc, ack_head, ack_date, max_scan=40)
1067
+
1068
+ maint_name = join_value(paras.get("MAINTENANCE MANAGEMENT", ""))
1069
+ if maint_name:
1070
+ set_layer3_name_after_management_heading(
1071
+ doc,
1072
+ "MAINTENANCE MANAGEMENT",
1073
+ ["Vehicle Registration Numbers of Records Examined"],
1074
+ maint_name,
1075
+ )
1076
+
1077
+ mass_name = join_value(paras.get("MASS MANAGEMENT", ""))
1078
+ if mass_name:
1079
+ set_layer3_name_after_management_heading(
1080
+ doc,
1081
+ "MASS MANAGEMENT",
1082
+ ["Vehicle Registration Numbers of Records Examined"],
1083
+ mass_name,
1084
+ )
1085
+
1086
+ fat_name = join_value(paras.get("FATIGUE MANAGEMENT", ""))
1087
+ if fat_name:
1088
+ set_layer3_name_after_management_heading(
1089
+ doc,
1090
+ "FATIGUE MANAGEMENT",
1091
+ ["Driver / Scheduler Records Examined"],
1092
+ fat_name,
1093
+ )
1094
+
1095
+
1096
+ # 3) ACN digits
1097
+ op_info = data.get("Operator Information", {})
1098
+ acn_val = join_value(op_info.get("Australian Company Number", ""))
1099
+ if acn_val:
1100
+ fill_acn_digits(doc, acn_val)
1101
+
1102
+ # 4) Vehicle tables
1103
+ maint = data.get("Vehicle Registration Numbers Maintenance", {})
1104
+ mass = data.get("Vehicle Registration Numbers Mass", {})
1105
+ t_m = find_vehicle_table(doc, "maintenance")
1106
+ if t_m and maint:
1107
+ fill_vehicle_table(t_m, "maintenance", maint)
1108
+ t_ms = find_mass_vehicle_numbers_table(doc)
1109
+ if t_ms and mass:
1110
+ fill_mass_vehicle_table_preserve_headers(t_ms, mass)
1111
+
1112
+ # 5) Driver table
1113
+ drivers = data.get("Driver / Scheduler Records Examined", {})
1114
+ t_d = find_driver_table(doc)
1115
+ if t_d and drivers:
1116
+ fill_driver_table(t_d, drivers)
1117
+
1118
+ # 6) Special: Audit Declaration dates via heading
1119
+ decl = data.get("Audit Declaration dates", {})
1120
+ if decl.get("Audit was conducted on"):
1121
+ update_heading_followed_red(doc, "Audit was conducted on", decl["Audit was conducted on"])
1122
+
1123
+ # 7) Operator Declaration (last page, bottom row only), and fix Auditor table header
1124
+ op_decl = data.get("Operator Declaration", {})
1125
+ if op_decl:
1126
+ fill_operator_declaration(
1127
+ doc,
1128
+ join_value(op_decl.get("Print Name", "")),
1129
+ join_value(op_decl.get("Position Title", "")),
1130
+ )
1131
+
1132
+ # ensure the header row of the second-last page “NHVAS APPROVED AUDITOR DECLARATION” table holds the label text, not pasted values
1133
+ ensure_auditor_decl_headers(doc)
1134
+
1135
+
1136
+ # 8) Attendance List
1137
+ # Attendance: replace red lines only
1138
+ atts = data.get("Attendance List (Names and Position Titles)", {})
1139
+ att_val = atts.get("Attendance List (Names and Position Titles)")
1140
+ if att_val:
1141
+ fill_attendance_block(doc, att_val)
1142
+
1143
+ # 9) Nature of the Operators Business (Summary): write once (no duplicates)
1144
+ biz = data.get("Nature of the Operators Business (Summary)", {})
1145
+ if biz:
1146
+ val = biz.get("Nature of the Operators Business (Summary):") or next(iter(biz.values()), "")
1147
+ if val:
1148
+ update_business_summary_once(doc, val)
1149
+
1150
+ # 10) Summary tables: FULL OVERWRITE of DETAILS from JSON
1151
+ mm_sum = data.get("Maintenance Management Summary", {})
1152
+ if mm_sum:
1153
+ overwrite_summary_details_cells(doc, "Maintenance Management Summary", mm_sum)
1154
+
1155
+ mass_sum = data.get("Mass Management Summary", {})
1156
+ if mass_sum:
1157
+ overwrite_summary_details_cells(doc, "Mass Management Summary", mass_sum)
1158
+
1159
+ fat_sum = data.get("Fatigue Management Summary", {})
1160
+ if fat_sum:
1161
+ overwrite_summary_details_cells(doc, "Fatigue Management Summary", fat_sum)
1162
+
1163
+
1164
+ doc.save(str(output_docx))
1165
+
1166
+ # ----------------------------- CLI -----------------------------
1167
+ if __name__ == "__main__":
1168
+ import sys
1169
+ from pathlib import Path
1170
+
1171
+ if len(sys.argv) != 4:
1172
+ print("Usage: python updated_word.py <json> <template.docx> <output.docx>")
1173
+ sys.exit(1)
1174
+
1175
+ a, b, c = map(Path, sys.argv[1:4])
1176
+ files = [a, b, c]
1177
+
1178
+ json_path = next((p for p in files if p.suffix.lower() == ".json"), None)
1179
+ docx_paths = [p for p in files if p.suffix.lower() == ".docx"]
1180
+
1181
+ if not json_path or len(docx_paths) < 2:
1182
+ print("Error: provide one .json and two .docx (template + output).")
1183
+ sys.exit(1)
1184
+
1185
+ # Template = the .docx that already exists; Output = the other .docx
1186
+ template_docx = next((p for p in docx_paths if p.exists()), docx_paths[0])
1187
+ output_docx = docx_paths[1] if docx_paths[0] == template_docx else docx_paths[0]
1188
+
1189
+ run(json_path, template_docx, output_docx)
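
A hypothetical invocation of the filler above, assuming the module is importable as updated_word; the paths are examples only:

    from pathlib import Path
    from updated_word import run

    run(
        Path("merged_output.json"),    # produced by update_docx_with_pdf.py
        Path("NHVAS_template.docx"),   # template containing the red placeholders
        Path("NHVAS_filled.docx"),     # output with placeholders rewritten in black
    )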
src/adapters/__init__.py ADDED
File without changes
src/adapters/infrastructure/__init__.py ADDED
File without changes
src/adapters/infrastructure/format_conversion_service_adapter.py ADDED
@@ -0,0 +1,13 @@
1
+ from domain.PdfImages import PdfImages
2
+ from domain.PdfSegment import PdfSegment
3
+ from ports.services.format_conversion_service import FormatConversionService
4
+ from adapters.infrastructure.format_converters.convert_table_to_html import extract_table_format
5
+ from adapters.infrastructure.format_converters.convert_formula_to_latex import extract_formula_format
6
+
7
+
8
+ class FormatConversionServiceAdapter(FormatConversionService):
9
+ def convert_table_to_html(self, pdf_images: PdfImages, segments: list[PdfSegment]) -> None:
10
+ extract_table_format(pdf_images, segments)
11
+
12
+ def convert_formula_to_latex(self, pdf_images: PdfImages, segments: list[PdfSegment]) -> None:
13
+ extract_formula_format(pdf_images, segments)
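
A minimal sketch of how the adapter above might be consumed behind its port; it uses only the names imported in the file, and the helper function itself is hypothetical:

    from domain.PdfImages import PdfImages
    from domain.PdfSegment import PdfSegment
    from ports.services.format_conversion_service import FormatConversionService

    def enrich_segments(service: FormatConversionService, images: PdfImages, segments: list[PdfSegment]) -> None:
        # each call rewrites the matching segments' text_content in place
        service.convert_table_to_html(images, segments)
        service.convert_formula_to_latex(images, segments)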
src/adapters/infrastructure/format_converters/__init__.py ADDED
File without changes
src/adapters/infrastructure/format_converters/convert_formula_to_latex.py ADDED
@@ -0,0 +1,43 @@
1
+ from PIL.Image import Image
2
+ from pix2tex.cli import LatexOCR
3
+ from domain.PdfImages import PdfImages
4
+ from domain.PdfSegment import PdfSegment
5
+ from pdf_token_type_labels import TokenType
6
+ import latex2mathml.converter
7
+
8
+
9
+ def has_arabic(text: str) -> bool:
10
+ return any("\u0600" <= char <= "\u06FF" or "\u0750" <= char <= "\u077F" for char in text)
11
+
12
+
13
+ def is_valid_latex(formula: str) -> bool:
14
+ try:
15
+ latex2mathml.converter.convert(formula)
16
+ return True
17
+ except Exception:
18
+ return False
19
+
20
+
21
+ def extract_formula_format(pdf_images: PdfImages, predicted_segments: list[PdfSegment]):
22
+ formula_segments = [segment for segment in predicted_segments if segment.segment_type == TokenType.FORMULA]
23
+ if not formula_segments:
24
+ return
25
+
26
+ model = LatexOCR()
27
+ model.args.temperature = 1e-8
28
+
29
+ for formula_segment in formula_segments:
30
+ if has_arabic(formula_segment.text_content):
31
+ continue
32
+ page_image: Image = pdf_images.pdf_images[formula_segment.page_number - 1]
33
+ left, top = formula_segment.bounding_box.left, formula_segment.bounding_box.top
34
+ right, bottom = formula_segment.bounding_box.right, formula_segment.bounding_box.bottom
35
+ left = int(left * pdf_images.dpi / 72)
36
+ top = int(top * pdf_images.dpi / 72)
37
+ right = int(right * pdf_images.dpi / 72)
38
+ bottom = int(bottom * pdf_images.dpi / 72)
39
+ formula_image = page_image.crop((left, top, right, bottom))
40
+ formula_result = model(formula_image)
41
+ if not is_valid_latex(formula_result):
42
+ continue
43
+ formula_segment.text_content = f"$${formula_result}$$"
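
The crop above converts PDF points (1/72 inch) into image pixels at the render DPI; a minimal restatement of that scaling:

    def points_to_pixels(value_pt: float, dpi: int) -> int:
        return int(value_pt * dpi / 72)

    assert points_to_pixels(72, 120) == 120   # one inch of page maps to 120 px at 120 DPI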
src/adapters/infrastructure/format_converters/convert_table_to_html.py ADDED
@@ -0,0 +1,33 @@
1
+ from PIL import Image
2
+ from domain.PdfImages import PdfImages
3
+ from domain.PdfSegment import PdfSegment
4
+ from pdf_token_type_labels import TokenType
5
+ from rapidocr import RapidOCR
6
+ from rapid_table import ModelType, RapidTable, RapidTableInput
7
+
8
+
9
+ def extract_table_format(pdf_images: PdfImages, predicted_segments: list[PdfSegment]):
10
+ table_segments = [segment for segment in predicted_segments if segment.segment_type == TokenType.TABLE]
11
+ if not table_segments:
12
+ return
13
+
14
+ input_args = RapidTableInput(model_type=ModelType["SLANETPLUS"])
15
+
16
+ ocr_engine = RapidOCR()
17
+ table_engine = RapidTable(input_args)
18
+
19
+ for table_segment in table_segments:
20
+ page_image: Image = pdf_images.pdf_images[table_segment.page_number - 1]
21
+ left, top = table_segment.bounding_box.left, table_segment.bounding_box.top
22
+ right, bottom = table_segment.bounding_box.right, table_segment.bounding_box.bottom
23
+ left = int(left * pdf_images.dpi / 72)
24
+ top = int(top * pdf_images.dpi / 72)
25
+ right = int(right * pdf_images.dpi / 72)
26
+ bottom = int(bottom * pdf_images.dpi / 72)
27
+ table_image = page_image.crop((left, top, right, bottom))
28
+ ori_ocr_res = ocr_engine(table_image)
29
+ if not ori_ocr_res.txts:
30
+ continue
31
+ ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
32
+ table_result = table_engine(table_image, ocr_results=ocr_results)
33
+ table_segment.text_content = table_result.pred_html
src/adapters/infrastructure/html_conversion_service_adapter.py ADDED
@@ -0,0 +1,23 @@
+ from typing import Optional, Union
+ from starlette.responses import Response
+
+ from domain.SegmentBox import SegmentBox
+ from ports.services.html_conversion_service import HtmlConversionService
+ from adapters.infrastructure.markup_conversion.pdf_to_markup_service_adapter import PdfToMarkupServiceAdapter
+ from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+
+
+ class HtmlConversionServiceAdapter(HtmlConversionService, PdfToMarkupServiceAdapter):
+
+     def __init__(self):
+         PdfToMarkupServiceAdapter.__init__(self, OutputFormat.HTML)
+
+     def convert_to_html(
+         self,
+         pdf_content: bytes,
+         segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         output_file: Optional[str] = None,
+     ) -> Union[str, Response]:
+         return self.convert_to_format(pdf_content, segments, extract_toc, dpi, output_file)
src/adapters/infrastructure/markdown_conversion_service_adapter.py ADDED
@@ -0,0 +1,23 @@
+ from typing import Optional, Union
+ from starlette.responses import Response
+
+ from domain.SegmentBox import SegmentBox
+ from ports.services.markdown_conversion_service import MarkdownConversionService
+ from adapters.infrastructure.markup_conversion.pdf_to_markup_service_adapter import PdfToMarkupServiceAdapter
+ from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+
+
+ class MarkdownConversionServiceAdapter(MarkdownConversionService, PdfToMarkupServiceAdapter):
+
+     def __init__(self):
+         PdfToMarkupServiceAdapter.__init__(self, OutputFormat.MARKDOWN)
+
+     def convert_to_markdown(
+         self,
+         pdf_content: bytes,
+         segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         output_file: Optional[str] = None,
+     ) -> Union[str, Response]:
+         return self.convert_to_format(pdf_content, segments, extract_toc, dpi, output_file)
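
The two adapters above differ only in the `OutputFormat` they pass to `PdfToMarkupServiceAdapter`; the actual conversion happens in `convert_to_format`. A hedged usage sketch (the PDF bytes and segment list are placeholders supplied by an earlier layout-analysis step, not produced here):

```python
# Sketch only: call both adapters on the same analysed document.
# `pdf_bytes` and `segments` are assumed inputs.
from adapters.infrastructure.html_conversion_service_adapter import HtmlConversionServiceAdapter
from adapters.infrastructure.markdown_conversion_service_adapter import MarkdownConversionServiceAdapter


def convert_both(pdf_bytes: bytes, segments: list) -> tuple[str, str]:
    # With output_file=None both calls return the markup as a plain string.
    html = HtmlConversionServiceAdapter().convert_to_html(pdf_bytes, segments, extract_toc=True)
    markdown = MarkdownConversionServiceAdapter().convert_to_markdown(pdf_bytes, segments, extract_toc=True)
    return html, markdown
```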
src/adapters/infrastructure/markup_conversion/ExtractedImage.py ADDED
@@ -0,0 +1,6 @@
+ from pydantic import BaseModel
+
+
+ class ExtractedImage(BaseModel):
+     image_data: bytes
+     filename: str
src/adapters/infrastructure/markup_conversion/Link.py ADDED
@@ -0,0 +1,8 @@
+ from pydantic import BaseModel
+ from domain.SegmentBox import SegmentBox
+
+
+ class Link(BaseModel):
+     source_segment: SegmentBox
+     destination_segment: SegmentBox
+     text: str
src/adapters/infrastructure/markup_conversion/OutputFormat.py ADDED
@@ -0,0 +1,6 @@
+ from enum import StrEnum
+
+
+ class OutputFormat(StrEnum):
+     HTML = "html"
+     MARKDOWN = "markdown"
src/adapters/infrastructure/markup_conversion/__init__.py ADDED
File without changes
src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py ADDED
@@ -0,0 +1,361 @@
+ import fitz
+ import tempfile
+ import zipfile
+ import io
+ import json
+ from fitz import Page
+ from pathlib import Path
+ from typing import Optional, Union
+ from PIL.Image import Image
+ from pdf2image import convert_from_path
+ from starlette.responses import Response
+
+ from domain.SegmentBox import SegmentBox
+ from pdf_features.PdfFeatures import PdfFeatures
+ from pdf_features.PdfToken import PdfToken
+ from pdf_features.Rectangle import Rectangle
+ from pdf_token_type_labels.Label import Label
+ from pdf_token_type_labels.PageLabels import PageLabels
+ from pdf_token_type_labels.PdfLabels import PdfLabels
+ from pdf_token_type_labels.TokenType import TokenType
+
+ from adapters.infrastructure.markup_conversion.OutputFormat import OutputFormat
+ from adapters.infrastructure.markup_conversion.Link import Link
+ from adapters.infrastructure.markup_conversion.ExtractedImage import ExtractedImage
+
+
+ class PdfToMarkupServiceAdapter:
+     def __init__(self, output_format: OutputFormat):
+         self.output_format = output_format
+
+     def convert_to_format(
+         self,
+         pdf_content: bytes,
+         segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         output_file: Optional[str] = None,
+     ) -> Union[str, Response]:
+         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
+             temp_file.write(pdf_content)
+             temp_pdf_path = Path(temp_file.name)
+
+         try:
+             extracted_images: list[ExtractedImage] = [] if output_file else None
+             user_base_name = Path(output_file).stem if output_file else None
+
+             content = self._generate_content(temp_pdf_path, segments, extract_toc, dpi, extracted_images, user_base_name)
+
+             if output_file:
+                 return self._create_zip_response(content, extracted_images, output_file, segments)
+
+             return content
+         finally:
+             if temp_pdf_path.exists():
+                 temp_pdf_path.unlink()
+
+     def _create_zip_response(
+         self,
+         content: str,
+         extracted_images: list[ExtractedImage],
+         output_filename: str,
+         segments: list[SegmentBox],
+     ) -> Response:
+         zip_buffer = io.BytesIO()
+
+         with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+             zip_file.writestr(output_filename, content.encode("utf-8"))
+
+             if extracted_images:
+                 base_name = Path(output_filename).stem
+                 pictures_dir = f"{base_name}_pictures/"
+
+                 for image in extracted_images:
+                     zip_file.writestr(f"{pictures_dir}{image.filename}", image.image_data)
+
+             base_name = Path(output_filename).stem
+             segmentation_filename = f"{base_name}_segmentation.json"
+             segmentation_data = self._create_segmentation_json(segments)
+             zip_file.writestr(segmentation_filename, segmentation_data)
+
+         zip_buffer.seek(0)
+
+         zip_filename = f"{Path(output_filename).stem}.zip"
+         return Response(
+             content=zip_buffer.getvalue(),
+             media_type="application/zip",
+             headers={"Content-Disposition": f"attachment; filename={zip_filename}"},
+         )
+
+     def _create_segmentation_json(self, segments: list[SegmentBox]) -> str:
+         segmentation_data = []
+         for segment in segments:
+             segmentation_data.append(segment.to_dict())
+         return json.dumps(segmentation_data, indent=4, ensure_ascii=False)
+
+     def _create_pdf_labels_from_segments(self, vgt_segments: list[SegmentBox]) -> PdfLabels:
+         page_numbers = sorted(set(segment.page_number for segment in vgt_segments))
+         page_labels: list[PageLabels] = []
+         for page_number in page_numbers:
+             segments_in_page = [s for s in vgt_segments if s.page_number == page_number]
+             labels: list[Label] = []
+             for segment in segments_in_page:
+                 rect = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+                 label = Label.from_rectangle(rect, TokenType.from_text(segment.type).get_index())
+                 labels.append(label)
+             page_labels.append(PageLabels(number=page_number, labels=labels))
+         return PdfLabels(pages=page_labels)
+
+     def _find_closest_segment(self, bounding_box: Rectangle, segments: list[SegmentBox]) -> Optional[SegmentBox]:
+         if not segments:
+             return None
+
+         def intersection_key(segment: SegmentBox) -> float:
+             segment_rect = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+             return bounding_box.get_intersection_percentage(segment_rect)
+
+         closest = max(segments, key=intersection_key)
+         max_intersection = intersection_key(closest)
+         if max_intersection > 0:
+             return closest
+
+         candidates = [s for s in segments if s.top > bounding_box.top]
+         if not candidates:
+             return None
+
+         def distance_key(segment: SegmentBox) -> tuple[float, float]:
+             vertical_dist = segment.top - bounding_box.top
+             segment_center_x = segment.left + segment.width / 2
+             box_center_x = bounding_box.left + bounding_box.width / 2
+             horizontal_dist = abs(segment_center_x - box_center_x)
+             return (vertical_dist, horizontal_dist)
+
+         return min(candidates, key=distance_key)
+
+     def _get_link_segments(
+         self, link: dict, page: Page, segments_by_page: dict[int, list[SegmentBox]]
+     ) -> Optional[tuple[SegmentBox, SegmentBox]]:
+         rect = link["from"]
+         source_box = Rectangle.from_coordinates(rect[0], rect[1], rect[2], rect[3])
+         source_page_num = page.number + 1
+         source_segments = segments_by_page.get(source_page_num, [])
+         source_segment = self._find_closest_segment(source_box, source_segments)
+         if not source_segment:
+             return None
+
+         dest_page_num = link.get("page", -1) + 1
+         dest_segments = segments_by_page.get(dest_page_num, [])
+         if not dest_segments:
+             return None
+
+         if "to" not in link:
+             dest_box = Rectangle.from_coordinates(0, 0, 20, 20)
+         else:
+             dest = link["to"] * page.transformation_matrix
+             dest_box = Rectangle.from_coordinates(dest[0], dest[1], dest[0] + 20, dest[1] + 20)
+
+         dest_segment = self._find_closest_segment(dest_box, dest_segments)
+         if not dest_segment:
+             return None
+
+         return source_segment, dest_segment
+
+     def _extract_links_by_segments(
+         self, pdf_path: Path, vgt_segments: list[SegmentBox]
+     ) -> tuple[dict[SegmentBox, list[Link]], dict[SegmentBox, list[Link]]]:
+         links_by_source: dict[SegmentBox, list[Link]] = {}
+         links_by_dest: dict[SegmentBox, list[Link]] = {}
+
+         segments_by_page: dict[int, list[SegmentBox]] = {}
+         for segment in vgt_segments:
+             segments_by_page.setdefault(segment.page_number, []).append(segment)
+
+         doc = fitz.open(pdf_path)
+         try:
+             for page_num in range(len(doc)):
+                 page: Page = doc[page_num]
+                 links = page.get_links()
+                 for link in links:
+                     if "page" not in link:
+                         continue
+                     rect = link["from"]
+                     text = page.get_text("text", clip=rect).strip()
+                     if not text:
+                         continue
+                     segments_pair = self._get_link_segments(link, page, segments_by_page)
+                     if not segments_pair:
+                         continue
+                     source, dest = segments_pair
+                     new_link = Link(source_segment=source, destination_segment=dest, text=text)
+                     links_by_source.setdefault(source, []).append(new_link)
+                     links_by_dest.setdefault(dest, []).append(new_link)
+         finally:
+             doc.close()
+
+         return links_by_source, links_by_dest
+
+     def _insert_reference_links(self, segment_text: str, links: list[Link]) -> str:
+         offset = 0
+         for link in links:
+             start_idx = segment_text.find(link.text, offset)
+             if start_idx == -1:
+                 continue
+             escaped_text = link.text.replace("[", "\\[").replace("]", "\\]")
+             md_link = f"[{escaped_text}](#{link.destination_segment.id})"
+             segment_text = segment_text[:start_idx] + md_link + segment_text[start_idx + len(link.text) :]
+             offset = start_idx + len(md_link)
+         return segment_text
+
+     def _process_picture_segment(
+         self,
+         segment: SegmentBox,
+         pdf_images: list[Image],
+         pdf_path: Path,
+         picture_id: int,
+         dpi: int = 72,
+         extracted_images: Optional[list[ExtractedImage]] = None,
+         user_base_name: Optional[str] = None,
+     ) -> str:
+
+         if extracted_images is None:
+             return ""
+
+         segment_box = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+         image = pdf_images[segment.page_number - 1]
+         left, top, right, bottom = segment_box.left, segment_box.top, segment_box.right, segment_box.bottom
+         if dpi != 72:
+             left = left * dpi / 72
+             top = top * dpi / 72
+             right = right * dpi / 72
+             bottom = bottom * dpi / 72
+         cropped = image.crop((left, top, right, bottom))
+
+         base_name = user_base_name if user_base_name else pdf_path.stem
+         image_name = f"{base_name}_{segment.page_number}_{picture_id}.png"
+
+         img_buffer = io.BytesIO()
+         cropped.save(img_buffer, format="PNG")
+         extracted_images.append(ExtractedImage(image_data=img_buffer.getvalue(), filename=image_name))
+         return f"<span id='{segment.id}'></span>\n" + f"<img src='{base_name}_pictures/{image_name}' alt=''>\n\n"
+
+     def _process_table_segment(self, segment: SegmentBox) -> str:
+         return f"<span id='{segment.id}'></span>\n" + segment.text + "\n\n"
+
+     def _get_token_content(self, token: PdfToken) -> str:
+         if self.output_format == OutputFormat.HTML:
+             return token.content_html
+         else:
+             return token.content_markdown
+
+     def _get_styled_content(self, token: PdfToken, content: str) -> str:
+         if self.output_format == OutputFormat.HTML:
+             styled = token.token_style.get_styled_content_html(content)
+             styled = token.token_style.script_type.get_styled_content(styled)
+             styled = token.token_style.list_level.get_styled_content_html(styled)
+             return token.token_style.hyperlink_style.get_styled_content_html(styled)
+         else:
+             styled = token.token_style.get_styled_content_markdown(content)
+             styled = token.token_style.script_type.get_styled_content(styled)
+             styled = token.token_style.list_level.get_styled_content_markdown(styled)
+             return token.token_style.hyperlink_style.get_styled_content_markdown(styled)
+
+     def _process_title_segment(self, tokens: list[PdfToken], segment: SegmentBox) -> str:
+         if not tokens:
+             return ""
+
+         title_type = tokens[0].token_style.title_type
+         content = " ".join([self._get_styled_content(token, token.content) for token in tokens])
+         if self.output_format == OutputFormat.HTML:
+             content = title_type.get_styled_content_html(content)
+         else:
+             content = title_type.get_styled_content_markdown(content)
+         anchor = f"<span id='{segment.id}'></span>\n"
+         return anchor + content + "\n\n"
+
+     def _process_regular_segment(
+         self,
+         tokens: list[PdfToken],
+         segment: SegmentBox,
+         links_by_source: dict[SegmentBox, list[Link]],
+         links_by_dest: dict[SegmentBox, list[Link]],
+     ) -> str:
+         if not tokens:
+             return ""
+         content = " ".join(self._get_token_content(t) for t in tokens)
+         if segment in links_by_source:
+             content = self._insert_reference_links(content, links_by_source[segment])
+         if segment in links_by_dest:
+             content = f"<span id='{segment.id}'></span>\n" + content
+         return content + "\n\n"
+
+     def _get_table_of_contents(self, vgt_segments: list[SegmentBox]) -> str:
+         title_segments = [s for s in vgt_segments if s.type in {TokenType.TITLE, TokenType.SECTION_HEADER}]
+         table_of_contents = "# Table of Contents\n\n"
+         for segment in title_segments:
+             if not segment.text.strip():
+                 continue
+             first_word = segment.text.split()[0]
+             indentation = max(0, first_word.count(".") - 1)
+             content = " " * indentation + "- [" + segment.text + "](#" + segment.id + ")\n"
+             table_of_contents += content
+         table_of_contents += "\n"
+         return table_of_contents + "\n\n"
+
+     def _set_segment_ids(self, vgt_segments: list[SegmentBox]) -> None:
+         segments_by_page: dict[int, list[SegmentBox]] = {}
+         for segment in vgt_segments:
+             segments_by_page.setdefault(segment.page_number, []).append(segment)
+         for page_number, segments in segments_by_page.items():
+             for segment_index, segment in enumerate(segments):
+                 segment.id = f"page-{page_number}-{segment_index}"
+
+     def _generate_content(
+         self,
+         pdf_path: Path,
+         vgt_segments: list[SegmentBox],
+         extract_toc: bool = False,
+         dpi: int = 120,
+         extracted_images: Optional[list[ExtractedImage]] = None,
+         user_base_name: Optional[str] = None,
+     ) -> str:
+         pdf_labels: PdfLabels = self._create_pdf_labels_from_segments(vgt_segments)
+         pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
+         pdf_features.set_token_types(pdf_labels)
+         pdf_features.set_token_styles()
+
+         self._set_segment_ids(vgt_segments)
+         content_parts: list[str] = []
+         if extract_toc:
+             content_parts.append(self._get_table_of_contents(vgt_segments))
+
+         links_by_source, links_by_dest = self._extract_links_by_segments(pdf_path, vgt_segments)
+
+         picture_segments = [s for s in vgt_segments if s.type == TokenType.PICTURE]
+         pdf_images: list[Image] = convert_from_path(pdf_path, dpi=dpi) if picture_segments else []
+
+         for page in pdf_features.pages:
+             segments_in_page = [s for s in vgt_segments if s.page_number == page.page_number]
+             picture_id = 0
+             for segment in segments_in_page:
+                 seg_box = Rectangle.from_width_height(segment.left, segment.top, segment.width, segment.height)
+                 tokens_in_seg = [t for t in page.tokens if t.bounding_box.get_intersection_percentage(seg_box) > 50]
+
+                 if segment.type == TokenType.PICTURE:
+                     content_parts.append(
+                         self._process_picture_segment(
+                             segment, pdf_images, pdf_path, picture_id, dpi, extracted_images, user_base_name
+                         )
+                     )
+                     picture_id += 1
+                 elif segment.type == TokenType.TABLE:
+                     content_parts.append(self._process_table_segment(segment))
+                 elif segment.type in {TokenType.TITLE, TokenType.SECTION_HEADER}:
+                     content_parts.append(self._process_title_segment(tokens_in_seg, segment))
+                 elif segment.type == TokenType.FORMULA:
+                     content_parts.append(segment.text + "\n\n")
+                 else:
+                     content_parts.append(
+                         self._process_regular_segment(tokens_in_seg, segment, links_by_source, links_by_dest)
+                     )
+
+         return "".join(content_parts)
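
`_generate_content` assigns a PDF token to a segment when more than 50% of the token's bounding box overlaps the segment, then dispatches on the segment type. A toy illustration of that overlap rule with plain tuples; the `overlap_percentage` function below is illustrative only and is not the library's `get_intersection_percentage`:

```python
# Illustrative only: percentage of box `a` that lies inside box `b`,
# mirroring the "> 50" threshold used when grouping tokens into segments.
def overlap_percentage(a: tuple[float, float, float, float], b: tuple[float, float, float, float]) -> float:
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    ix = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    iy = max(0.0, min(ay2, by2) - max(ay1, by1))
    area_a = max(0.0, (ax2 - ax1) * (ay2 - ay1))
    return 100.0 * ix * iy / area_a if area_a else 0.0


token_box = (10, 10, 30, 20)   # 20 x 10 token
segment_box = (0, 0, 25, 50)   # covers the token's left 15 of 20 units
print(overlap_percentage(token_box, segment_box))  # 75.0 -> token belongs to this segment
```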
src/adapters/infrastructure/ocr/__init__.py ADDED
File without changes
src/adapters/infrastructure/ocr/languages.py ADDED
@@ -0,0 +1,174 @@
+ import subprocess
+
+ iso_to_tesseract = {
+     "af": "afr", # Afrikaans
+     "all": "all", # Allar
+     "am": "amh", # Amharic
+     "ar": "ara", # Arabic
+     "as": "asm", # Assamese
+     "az": "aze", # Azerbaijani
+     "aze-cyrl": "aze-cyrl", # Azerbaijani (Cyrillic)
+     "be": "bel", # Belarusian
+     "bn": "ben", # Bangla
+     "bo": "bod", # Tibetan
+     "bs": "bos", # Bosnian
+     "br": "bre", # Breton
+     "bg": "bul", # Bulgarian
+     "ca": "cat", # Catalan
+     "ceb": "ceb", # Cebuano
+     "cs": "ces", # Czech
+     "zh-Hans": "chi_sim", # Chinese (Simplified)
+     "chi-sim-vert": "chi-sim-vert", # Chinese (Simplified) vertical
+     "zh-Hant": "chi_tra", # Chinese (Traditional)
+     "chi-tra-vert": "chi-tra-vert", # Chinese (Traditional) vertical
+     "chr": "chr", # Cherokee
+     "co": "cos", # Corsican
+     "cy": "cym", # Welsh
+     "da": "dan", # Danish
+     "de": "deu", # German
+     "dv": "div", # Divehi
+     "dz": "dzo", # Dzongkha
+     "el": "ell", # Greek
+     "en": "eng", # English
+     "enm": "enm", # Middle English
+     "eo": "epo", # Esperanto
+     "et": "est", # Estonian
+     "eu": "eus", # Basque
+     "fo": "fao", # Faroese
+     "fa": "fas", # Persian
+     "fil": "fil", # Filipino
+     "fi": "fin", # Finnish
+     "fr": "fra", # French
+     "frk": "frk", # Frankish
+     "frm": "frm", # Middle French
+     "fy": "fry", # Western Frisian
+     "gd": "gla", # Scottish Gaelic
+     "ga": "gle", # Irish
+     "gl": "glg", # Galician
+     "grc": "grc", # Ancient Greek
+     "gu": "guj", # Gujarati
+     "ht": "hat", # Haitian Creole
+     "he": "heb", # Hebrew
+     "hi": "hin", # Hindi
+     "hr": "hrv", # Croatian
+     "hu": "hun", # Hungarian
+     "hy": "hye", # Armenian
+     "iu": "iku", # Inuktitut
+     "id": "ind", # Indonesian
+     "is": "isl", # Icelandic
+     "it": "ita", # Italian
+     "ita-old": "ita-old", # Old Italian
+     "jv": "jav", # Javanese
+     "ja": "jpn", # Japanese
+     "jpn-vert": "jpn-vert", # Japanese vertical
+     "kn": "kan", # Kannada
+     "ka": "kat", # Georgian
+     "kat-old": "kat-old", # Old Georgian
+     "kk": "kaz", # Kazakh
+     "km": "khm", # Khmer
+     "ky": "kir", # Kyrgyz
+     "kmr": "kmr", # Northern Kurdish
+     "ko": "kor", # Korean
+     "kor-vert": "kor_vert", # Korean vertical
+     "lo": "lao", # Lao
+     "la": "lat", # Latin
+     "lv": "lav", # Latvian
+     "lt": "lit", # Lithuanian
+     "lb": "ltz", # Luxembourgish
+     "ml": "mal", # Malayalam
+     "mr": "mar", # Marathi
+     "mk": "mkd", # Macedonian
+     "mt": "mlt", # Maltese
+     "mn": "mon", # Mongolian
+     "mi": "mri", # Māori
+     "ms": "msa", # Malay
+     "my": "mya", # Burmese
+     "ne": "nep", # Nepali
+     "nl": "nld", # Dutch
+     "no": "nor", # Norwegian
+     "oc": "oci", # Occitan
+     "or": "ori", # Odia
+     "osd": "osd", # Unknown language [osd]
+     "pa": "pan", # Punjabi
+     "pl": "pol", # Polish
+     "pt": "por", # Portuguese
+     "ps": "pus", # Pashto
+     "qu": "que", # Quechua
+     "ro": "ron", # Romanian
+     "ru": "rus", # Russian
+     "sa": "san", # Sanskrit
+     "script-arab": "script-arab", # Arabic script
+     "script-armn": "script-armn", # Armenian script
+     "script-beng": "script-beng", # Bengali script
+     "script-cans": "script-cans", # Canadian Aboriginal script
+     "script-cher": "script-cher", # Cherokee script
+     "script-cyrl": "script-cyrl", # Cyrillic script
+     "script-deva": "script-deva", # Devanagari script
+     "script-ethi": "script-ethi", # Ethiopic script
+     "script-frak": "script-frak", # Frankish script
+     "script-geor": "script-geor", # Georgian script
+     "script-grek": "script-grek", # Greek script
+     "script-gujr": "script-gujr", # Gujarati script
+     "script-guru": "script-guru", # Gurmukhi script
+     "script-hang": "script-hang", # Hangul script
+     "script-hang-vert": "script-hang-vert", # Hangul script vertical
+     "script-hans": "script-hans",
+     "script-hans-vert": "script-hans-vert",
+     "script-hant": "script-hant",
+     "script-hant-vert": "script-hant-vert",
+     "script-hebr": "script-hebr", # Hebrew script
+     "script-jpan": "script-jpan", # Japanese script
+     "script-jpan-vert": "script-jpan-vert", # Japanese script vertical
+     "script-khmr": "script-khmr", # Khmer script
+     "script-knda": "script-knda", # Kannada script
+     "script-laoo": "script-laoo", # Lao script
+     "script-latn": "script-latn",
+     "script-mlym": "script-mlym", # Malayalam script
+     "script-mymr": "script-mymr", # Myanmar script
+     "script-orya": "script-orya", # Odia script
+     "script-sinh": "script-sinh", # Sinhala script
+     "script-syrc": "script-syrc", # Syriac script
+     "script-taml": "script-taml", # Tamil script
+     "script-telu": "script-telu", # Telugu script
+     "script-thaa": "script-thaa", # Thaana script
+     "script-thai": "script-thai", # Thai script
+     "script-tibt": "script-tibt", # Tibetan script
+     "script-viet": "script-viet", # Vietnamese script
+     "si": "sin", # Sinhala
+     "sk": "slk", # Slovak
+     "sl": "slv", # Slovenian
+     "sd": "snd", # Sindhi
+     "es": "spa", # Spanish
+     "spa-old": "spa-old", # Old Spanish
+     "sq": "sqi", # Albanian
+     "sr": "srp", # Serbian
+     "srp-latn": "srp-latn", # Serbian (Latin)
+     "su": "sun", # Sundanese
+     "sw": "swa", # Swahili
+     "sv": "swe", # Swedish
+     "syr": "syr", # Syriac
+     "ta": "tam", # Tamil
+     "tt": "tat", # Tatar
+     "te": "tel", # Telugu
+     "tg": "tgk", # Tajik
+     "th": "tha", # Thai
+     "ti": "tir", # Tigrinya
+     "to": "ton", # Tongan
+     "tr": "tur", # Turkish
+     "ug": "uig", # Uyghur
+     "uk": "ukr", # Ukrainian
+     "ur": "urd", # Urdu
+     "uz": "uzb", # Uzbek
+     "uzb-cyrl": "uzb-cyrl", # Uzbek (Cyrillic)
+     "vi": "vie", # Vietnamese
+     "yi": "yid", # Yiddish
+     "yo": "yor", # Yoruba
+ }
+
+
+ def supported_languages():
+     cmd = "tesseract --list-langs | grep -v osd | awk '{if(NR>1)print}'"
+     sp = subprocess.Popen(["/bin/bash", "-c", cmd], stdout=subprocess.PIPE)
+     tesseract_langs = [line.strip().decode("utf-8") for line in sp.stdout.readlines()]
+     inverted_iso_dict = {v: k for k, v in iso_to_tesseract.items()}
+     return list({tesseract_key: inverted_iso_dict[tesseract_key] for tesseract_key in tesseract_langs}.values())
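
`supported_languages()` shells out to `tesseract --list-langs` and maps the installed traineddata names back to ISO codes through the inverted table, so its result depends on which language packs are installed. The forward lookup can be illustrated without a Tesseract install, assuming only the `iso_to_tesseract` table above; the fallback to `eng` is an assumption made for the example, not behaviour of this module:

```python
# Sketch: resolve the Tesseract traineddata name for an ISO code,
# falling back to English when the language is not in the table (assumption).
def tesseract_lang_for(iso_code: str, default: str = "eng") -> str:
    return iso_to_tesseract.get(iso_code, default)


print(tesseract_lang_for("de"))       # deu
print(tesseract_lang_for("zh-Hans"))  # chi_sim
print(tesseract_lang_for("xx"))       # eng (fallback)
```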