Spaces:
Running
Running
Commit
·
ae5ac9c
1
Parent(s):
adf4200
Adding Docling
Browse files
- app.py +19 -3
- requirements.txt +55 -0
app.py
CHANGED
|
@@ -1,12 +1,28 @@
|
|
| 1 |
from PyPDF2 import PdfReader
|
| 2 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    The file is opened with ``PdfReader`` and the length of its ``pages``
    collection is returned.
    """
    return len(PdfReader(pdf_path).pages)
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
def inference(pdf_path, page_num):
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
# Heading and one-line summary strings for the app.
# NOTE(review): presumably rendered in the Gradio UI; the usage is outside this view.
title = "OCR Arena"
description = (
    "A simple Gradio interface to extract text from PDFs and compare OCR models"
)
|
|
@@ -30,10 +46,10 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
| 30 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
| 31 |
submit_btn = gr.Button("Submit", variant='primary')
|
| 32 |
|
| 33 |
-
submit_btn.click(inference, inputs=[pdf, page_num], outputs=
|
| 34 |
|
| 35 |
with gr.Column():
|
| 36 |
-
|
| 37 |
|
| 38 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
| 39 |
|
|
|
|
| 1 |
from PyPDF2 import PdfReader
|
| 2 |
import gradio as gr
|
| 3 |
+
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 4 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 5 |
+
from docling.datamodel.base_models import InputFormat
|
| 6 |
+
|
| 7 |
+
# Docling PDF pipeline settings; remote services are explicitly enabled.
pipeline_options = PdfPipelineOptions(enable_remote_services=True)

# Module-level converter shared by every request: PDF inputs are processed
# with the pipeline options defined above.
_pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(
    format_options={InputFormat.PDF: _pdf_format_option},
)
|
| 13 |
|
| 14 |
def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    The file is opened with ``PdfReader`` and the length of its ``pages``
    collection is returned.
    """
    return len(PdfReader(pdf_path).pages)
|
| 17 |
|
| 18 |
+
def get_docling_ocr(pdf_path, page_num):
    """Convert a single page of a PDF to Markdown with Docling.

    Args:
        pdf_path: Path of the PDF file handed to the shared ``converter``.
        page_num: Page to convert. It is used as both ends of
            ``page_range``, so exactly one page is processed. Accepts an
            ``int`` or a float such as ``3.0``; fractional values are
            truncated toward zero by ``int()``.

    Returns:
        The Markdown export of the converted document.

    Raises:
        TypeError: If ``page_num`` cannot be converted to ``int``
            (for example ``None``).
    """
    # Numeric UI widgets can deliver whole-valued floats (e.g. 3.0); coerce so
    # the page range is always a pair of integers.
    # NOTE(review): assumes Docling expects integer page numbers here —
    # confirm against the DocumentConverter.convert documentation.
    page = int(page_num)
    result = converter.convert(pdf_path, page_range=(page, page))
    return result.document.export_to_markdown()
|
| 22 |
+
|
| 23 |
def inference(pdf_path, page_num):
    """Run OCR on one page of a PDF and return the extracted text.

    This delegates entirely to ``get_docling_ocr`` and returns its result
    unchanged.
    """
    return get_docling_ocr(pdf_path, page_num)
|
| 26 |
|
| 27 |
# Heading and one-line summary strings for the app.
# NOTE(review): presumably rendered in the Gradio UI; the usage is outside this view.
title = "OCR Arena"
description = (
    "A simple Gradio interface to extract text from PDFs and compare OCR models"
)
|
|
|
|
| 46 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
| 47 |
submit_btn = gr.Button("Submit", variant='primary')
|
| 48 |
|
| 49 |
+
submit_btn.click(inference, inputs=[pdf, page_num], outputs=docling_ocr_out)
|
| 50 |
|
| 51 |
with gr.Column():
|
| 52 |
+
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
|
| 53 |
|
| 54 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
| 55 |
|
requirements.txt
CHANGED
|
@@ -1,13 +1,23 @@
|
|
| 1 |
aiofiles==24.1.0
|
| 2 |
annotated-types==0.7.0
|
| 3 |
anyio==4.9.0
|
|
|
|
|
|
|
| 4 |
certifi==2025.6.15
|
| 5 |
charset-normalizer==3.4.2
|
| 6 |
click==8.2.1
|
| 7 |
colorama==0.4.6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
fastapi==0.115.14
|
| 9 |
ffmpy==0.6.0
|
| 10 |
filelock==3.18.0
|
|
|
|
| 11 |
fsspec==2025.5.1
|
| 12 |
gradio==5.35.0
|
| 13 |
gradio_client==1.10.4
|
|
@@ -17,35 +27,79 @@ httpcore==1.0.9
|
|
| 17 |
httpx==0.28.1
|
| 18 |
huggingface-hub==0.33.1
|
| 19 |
idna==3.10
|
|
|
|
| 20 |
Jinja2==3.1.6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
markdown-it-py==3.0.0
|
|
|
|
| 22 |
MarkupSafe==3.0.2
|
| 23 |
mdurl==0.1.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
numpy==2.2.6
|
|
|
|
|
|
|
| 25 |
orjson==3.10.18
|
| 26 |
packaging==25.0
|
| 27 |
pandas==2.3.0
|
| 28 |
pillow==11.2.1
|
|
|
|
|
|
|
| 29 |
pydantic==2.11.7
|
|
|
|
| 30 |
pydantic_core==2.33.2
|
| 31 |
pydub==0.25.1
|
| 32 |
Pygments==2.19.2
|
|
|
|
| 33 |
PyPDF2==3.0.1
|
|
|
|
|
|
|
| 34 |
python-dateutil==2.9.0.post0
|
|
|
|
|
|
|
| 35 |
python-multipart==0.0.20
|
|
|
|
| 36 |
pytz==2025.2
|
|
|
|
| 37 |
PyYAML==6.0.2
|
|
|
|
|
|
|
| 38 |
requests==2.32.4
|
| 39 |
rich==14.0.0
|
|
|
|
|
|
|
| 40 |
ruff==0.12.1
|
| 41 |
safehttpx==0.1.6
|
|
|
|
|
|
|
|
|
|
| 42 |
semantic-version==2.10.0
|
|
|
|
|
|
|
|
|
|
| 43 |
shellingham==1.5.4
|
| 44 |
six==1.17.0
|
| 45 |
sniffio==1.3.1
|
|
|
|
| 46 |
starlette==0.46.2
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
tomlkit==0.13.3
|
|
|
|
|
|
|
| 48 |
tqdm==4.67.1
|
|
|
|
| 49 |
typer==0.16.0
|
| 50 |
typing-inspection==0.4.1
|
| 51 |
typing_extensions==4.14.0
|
|
@@ -53,3 +107,4 @@ tzdata==2025.2
|
|
| 53 |
urllib3==2.5.0
|
| 54 |
uvicorn==0.34.3
|
| 55 |
websockets==15.0.1
|
|
|
|
|
|
| 1 |
aiofiles==24.1.0
|
| 2 |
annotated-types==0.7.0
|
| 3 |
anyio==4.9.0
|
| 4 |
+
attrs==25.3.0
|
| 5 |
+
beautifulsoup4==4.13.4
|
| 6 |
certifi==2025.6.15
|
| 7 |
charset-normalizer==3.4.2
|
| 8 |
click==8.2.1
|
| 9 |
colorama==0.4.6
|
| 10 |
+
dill==0.4.0
|
| 11 |
+
docling==2.39.0
|
| 12 |
+
docling-core==2.39.0
|
| 13 |
+
docling-ibm-models==3.6.0
|
| 14 |
+
docling-parse==4.1.0
|
| 15 |
+
easyocr==1.7.2
|
| 16 |
+
et_xmlfile==2.0.0
|
| 17 |
fastapi==0.115.14
|
| 18 |
ffmpy==0.6.0
|
| 19 |
filelock==3.18.0
|
| 20 |
+
filetype==1.2.0
|
| 21 |
fsspec==2025.5.1
|
| 22 |
gradio==5.35.0
|
| 23 |
gradio_client==1.10.4
|
|
|
|
| 27 |
httpx==0.28.1
|
| 28 |
huggingface-hub==0.33.1
|
| 29 |
idna==3.10
|
| 30 |
+
imageio==2.37.0
|
| 31 |
Jinja2==3.1.6
|
| 32 |
+
jsonlines==3.1.0
|
| 33 |
+
jsonref==1.1.0
|
| 34 |
+
jsonschema==4.24.0
|
| 35 |
+
jsonschema-specifications==2025.4.1
|
| 36 |
+
latex2mathml==3.78.0
|
| 37 |
+
lazy_loader==0.4
|
| 38 |
+
lxml==5.4.0
|
| 39 |
markdown-it-py==3.0.0
|
| 40 |
+
marko==2.1.4
|
| 41 |
MarkupSafe==3.0.2
|
| 42 |
mdurl==0.1.2
|
| 43 |
+
mpire==2.10.2
|
| 44 |
+
mpmath==1.3.0
|
| 45 |
+
multiprocess==0.70.18
|
| 46 |
+
networkx==3.5
|
| 47 |
+
ninja==1.11.1.4
|
| 48 |
numpy==2.2.6
|
| 49 |
+
opencv-python-headless==4.11.0.86
|
| 50 |
+
openpyxl==3.1.5
|
| 51 |
orjson==3.10.18
|
| 52 |
packaging==25.0
|
| 53 |
pandas==2.3.0
|
| 54 |
pillow==11.2.1
|
| 55 |
+
pluggy==1.6.0
|
| 56 |
+
pyclipper==1.3.0.post6
|
| 57 |
pydantic==2.11.7
|
| 58 |
+
pydantic-settings==2.10.1
|
| 59 |
pydantic_core==2.33.2
|
| 60 |
pydub==0.25.1
|
| 61 |
Pygments==2.19.2
|
| 62 |
+
pylatexenc==2.10
|
| 63 |
PyPDF2==3.0.1
|
| 64 |
+
pypdfium2==4.30.1
|
| 65 |
+
python-bidi==0.6.6
|
| 66 |
python-dateutil==2.9.0.post0
|
| 67 |
+
python-docx==1.2.0
|
| 68 |
+
python-dotenv==1.1.1
|
| 69 |
python-multipart==0.0.20
|
| 70 |
+
python-pptx==1.0.2
|
| 71 |
pytz==2025.2
|
| 72 |
+
pywin32==310; sys_platform == "win32"
|
| 73 |
PyYAML==6.0.2
|
| 74 |
+
referencing==0.36.2
|
| 75 |
+
regex==2024.11.6
|
| 76 |
requests==2.32.4
|
| 77 |
rich==14.0.0
|
| 78 |
+
rpds-py==0.25.1
|
| 79 |
+
rtree==1.4.0
|
| 80 |
ruff==0.12.1
|
| 81 |
safehttpx==0.1.6
|
| 82 |
+
safetensors==0.5.3
|
| 83 |
+
scikit-image==0.25.2
|
| 84 |
+
scipy==1.16.0
|
| 85 |
semantic-version==2.10.0
|
| 86 |
+
semchunk==2.2.2
|
| 87 |
+
setuptools==80.9.0
|
| 88 |
+
shapely==2.1.1
|
| 89 |
shellingham==1.5.4
|
| 90 |
six==1.17.0
|
| 91 |
sniffio==1.3.1
|
| 92 |
+
soupsieve==2.7
|
| 93 |
starlette==0.46.2
|
| 94 |
+
sympy==1.14.0
|
| 95 |
+
tabulate==0.9.0
|
| 96 |
+
tifffile==2025.6.11
|
| 97 |
+
tokenizers==0.21.2
|
| 98 |
tomlkit==0.13.3
|
| 99 |
+
torch==2.7.1
|
| 100 |
+
torchvision==0.22.1
|
| 101 |
tqdm==4.67.1
|
| 102 |
+
transformers==4.53.0
|
| 103 |
typer==0.16.0
|
| 104 |
typing-inspection==0.4.1
|
| 105 |
typing_extensions==4.14.0
|
|
|
|
| 107 |
urllib3==2.5.0
|
| 108 |
uvicorn==0.34.3
|
| 109 |
websockets==15.0.1
|
| 110 |
+
xlsxwriter==3.2.5
|