taprosoft
committed on
Commit
·
3bce890
1
Parent(s):
36add35
feat: add enable visualization flag
Browse files
- app.py +33 -9
- backends/docling.py +4 -2
- backends/marker.py +3 -1
- backends/mineru.py +4 -0
- backends/settings.py +3 -0
- backends/unstructured.py +3 -1
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from backends import (
|
|
| 11 |
convert_mineru,
|
| 12 |
convert_unstructured,
|
| 13 |
)
|
|
|
|
| 14 |
from utils import remove_images_from_markdown, trim_pages
|
| 15 |
|
| 16 |
TRIMMED_PDF_PATH = Path("/tmp/gradio/trim")
|
|
@@ -18,9 +19,9 @@ TRIMMED_PDF_PATH.mkdir(exist_ok=True)
|
|
| 18 |
|
| 19 |
|
| 20 |
def convert_document(path, method, enabled=True):
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
return "", "", []
|
| 25 |
|
| 26 |
# benchmarking
|
|
@@ -84,7 +85,6 @@ with gr.Blocks(
|
|
| 84 |
output_tabs = []
|
| 85 |
visualization_sub_tabs = []
|
| 86 |
first_method = supported_methods[0]
|
| 87 |
-
num_methods = len(supported_methods)
|
| 88 |
|
| 89 |
with gr.Row():
|
| 90 |
with gr.Column(variant="panel", scale=5):
|
|
@@ -106,7 +106,9 @@ with gr.Blocks(
|
|
| 106 |
)
|
| 107 |
with gr.Row():
|
| 108 |
visual_checkbox = gr.Checkbox(
|
| 109 |
-
label="Enable debug visualizations",
|
|
|
|
|
|
|
| 110 |
)
|
| 111 |
with gr.Row():
|
| 112 |
convert_btn = gr.Button("Convert", variant="primary", scale=2)
|
|
@@ -134,7 +136,10 @@ with gr.Blocks(
|
|
| 134 |
line_breaks=True,
|
| 135 |
latex_delimiters=latex_delimiters,
|
| 136 |
)
|
| 137 |
-
with gr.Tab(
|
|
|
|
|
|
|
|
|
|
| 138 |
debug_images = gr.Gallery(
|
| 139 |
show_label=False,
|
| 140 |
container=False,
|
|
@@ -159,16 +164,35 @@ with gr.Blocks(
|
|
| 159 |
)
|
| 160 |
for idx, method in enumerate(supported_methods):
|
| 161 |
|
| 162 |
-
def progress_message(
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
def process_method(input_file, selected_methods, method=method):
|
|
|
|
|
|
|
| 166 |
return convert_document(
|
| 167 |
input_file, method=method, enabled=method in selected_methods
|
| 168 |
)
|
| 169 |
|
| 170 |
click_event = click_event.then(
|
| 171 |
-
fn=lambda
|
|
|
|
| 172 |
outputs=[progress_status],
|
| 173 |
).then(
|
| 174 |
fn=lambda input_file, methods, method=method: process_method(
|
|
|
|
| 11 |
convert_mineru,
|
| 12 |
convert_unstructured,
|
| 13 |
)
|
| 14 |
+
from backends.settings import ENABLE_DEBUG_MODE
|
| 15 |
from utils import remove_images_from_markdown, trim_pages
|
| 16 |
|
| 17 |
TRIMMED_PDF_PATH = Path("/tmp/gradio/trim")
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def convert_document(path, method, enabled=True):
|
| 22 |
+
if enabled:
|
| 23 |
+
print("Processing file", path, "with method", method)
|
| 24 |
+
else:
|
| 25 |
return "", "", []
|
| 26 |
|
| 27 |
# benchmarking
|
|
|
|
| 85 |
output_tabs = []
|
| 86 |
visualization_sub_tabs = []
|
| 87 |
first_method = supported_methods[0]
|
|
|
|
| 88 |
|
| 89 |
with gr.Row():
|
| 90 |
with gr.Column(variant="panel", scale=5):
|
|
|
|
| 106 |
)
|
| 107 |
with gr.Row():
|
| 108 |
visual_checkbox = gr.Checkbox(
|
| 109 |
+
label="Enable debug visualizations",
|
| 110 |
+
visible=ENABLE_DEBUG_MODE,
|
| 111 |
+
value=True,
|
| 112 |
)
|
| 113 |
with gr.Row():
|
| 114 |
convert_btn = gr.Button("Convert", variant="primary", scale=2)
|
|
|
|
| 136 |
line_breaks=True,
|
| 137 |
latex_delimiters=latex_delimiters,
|
| 138 |
)
|
| 139 |
+
with gr.Tab(
|
| 140 |
+
"Debug visualizations",
|
| 141 |
+
visible=ENABLE_DEBUG_MODE,
|
| 142 |
+
) as visual_sub_tab:
|
| 143 |
debug_images = gr.Gallery(
|
| 144 |
show_label=False,
|
| 145 |
container=False,
|
|
|
|
| 164 |
)
|
| 165 |
for idx, method in enumerate(supported_methods):
|
| 166 |
|
| 167 |
+
def progress_message(selected_methods, method=method):
|
| 168 |
+
selected_methods_indices = [
|
| 169 |
+
idx
|
| 170 |
+
for idx, current_method in enumerate(supported_methods)
|
| 171 |
+
if current_method in selected_methods
|
| 172 |
+
]
|
| 173 |
+
try:
|
| 174 |
+
current_method_idx = selected_methods_indices.index(
|
| 175 |
+
supported_methods.index(method)
|
| 176 |
+
)
|
| 177 |
+
msg = (
|
| 178 |
+
f"Processing ({current_method_idx + 1} / "
|
| 179 |
+
f"{len(selected_methods)}) **{method}**...\n\n"
|
| 180 |
+
)
|
| 181 |
+
except ValueError:
|
| 182 |
+
msg = gr.update()
|
| 183 |
+
|
| 184 |
+
return msg
|
| 185 |
|
| 186 |
def process_method(input_file, selected_methods, method=method):
|
| 187 |
+
if input_file is None:
|
| 188 |
+
raise ValueError("Please upload a PDF file first!")
|
| 189 |
return convert_document(
|
| 190 |
input_file, method=method, enabled=method in selected_methods
|
| 191 |
)
|
| 192 |
|
| 193 |
click_event = click_event.then(
|
| 194 |
+
fn=lambda methods, method=method: progress_message(methods, method),
|
| 195 |
+
inputs=[methods],
|
| 196 |
outputs=[progress_status],
|
| 197 |
).then(
|
| 198 |
fn=lambda input_file, methods, method=method: process_method(
|
backends/docling.py
CHANGED
|
@@ -10,6 +10,8 @@ from docling.datamodel.settings import settings
|
|
| 10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 11 |
from docling_core.types.doc import ImageRefMode
|
| 12 |
|
|
|
|
|
|
|
| 13 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
| 14 |
|
| 15 |
# Docling settings
|
|
@@ -23,8 +25,8 @@ pipeline_options.images_scale = 2.0
|
|
| 23 |
|
| 24 |
# debug visualization settings
|
| 25 |
settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
|
| 26 |
-
settings.debug.visualize_layout =
|
| 27 |
-
settings.debug.visualize_tables =
|
| 28 |
|
| 29 |
# Docling init
|
| 30 |
docling_converter = DocumentConverter(
|
|
|
|
| 10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 11 |
from docling_core.types.doc import ImageRefMode
|
| 12 |
|
| 13 |
+
from .settings import ENABLE_DEBUG_MODE
|
| 14 |
+
|
| 15 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
| 16 |
|
| 17 |
# Docling settings
|
|
|
|
| 25 |
|
| 26 |
# debug visualization settings
|
| 27 |
settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
|
| 28 |
+
settings.debug.visualize_layout = ENABLE_DEBUG_MODE
|
| 29 |
+
settings.debug.visualize_tables = ENABLE_DEBUG_MODE
|
| 30 |
|
| 31 |
# Docling init
|
| 32 |
docling_converter = DocumentConverter(
|
backends/marker.py
CHANGED
|
@@ -8,11 +8,13 @@ from marker.models import create_model_dict
|
|
| 8 |
from marker.output import text_from_rendered
|
| 9 |
from marker.settings import settings
|
| 10 |
|
|
|
|
|
|
|
| 11 |
# Marker init
|
| 12 |
marker_converter = PdfConverter(
|
| 13 |
artifact_dict=create_model_dict(),
|
| 14 |
config={
|
| 15 |
-
"debug_pdf_images":
|
| 16 |
},
|
| 17 |
)
|
| 18 |
|
|
|
|
| 8 |
from marker.output import text_from_rendered
|
| 9 |
from marker.settings import settings
|
| 10 |
|
| 11 |
+
from .settings import ENABLE_DEBUG_MODE
|
| 12 |
+
|
| 13 |
# Marker init
|
| 14 |
marker_converter = PdfConverter(
|
| 15 |
artifact_dict=create_model_dict(),
|
| 16 |
config={
|
| 17 |
+
"debug_pdf_images": ENABLE_DEBUG_MODE,
|
| 18 |
},
|
| 19 |
)
|
| 20 |
|
backends/mineru.py
CHANGED
|
@@ -7,6 +7,8 @@ import pymupdf
|
|
| 7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
| 8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
| 9 |
|
|
|
|
|
|
|
| 10 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
| 11 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
| 12 |
|
|
@@ -48,6 +50,8 @@ def do_process_mineru(input_path, output_dir):
|
|
| 48 |
parse_method,
|
| 49 |
debug_able=False,
|
| 50 |
f_dump_orig_pdf=False,
|
|
|
|
|
|
|
| 51 |
formula_enable=False,
|
| 52 |
table_enable=True,
|
| 53 |
)
|
|
|
|
| 7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
| 8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
| 9 |
|
| 10 |
+
from .settings import ENABLE_DEBUG_MODE
|
| 11 |
+
|
| 12 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
| 13 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
| 14 |
|
|
|
|
| 50 |
parse_method,
|
| 51 |
debug_able=False,
|
| 52 |
f_dump_orig_pdf=False,
|
| 53 |
+
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
| 54 |
+
f_draw_char_bbox=ENABLE_DEBUG_MODE,
|
| 55 |
formula_enable=False,
|
| 56 |
table_enable=True,
|
| 57 |
)
|
backends/settings.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
|
backends/unstructured.py
CHANGED
|
@@ -5,6 +5,8 @@ from matplotlib import font_manager
|
|
| 5 |
from unstructured.partition.pdf import partition_pdf
|
| 6 |
from unstructured.partition.pdf_image.analysis import bbox_visualisation
|
| 7 |
|
|
|
|
|
|
|
| 8 |
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
|
| 9 |
|
| 10 |
|
|
@@ -59,7 +61,7 @@ def convert_unstructured(path: str, file_name: str):
|
|
| 59 |
# extract_images_in_pdf=True,
|
| 60 |
extract_image_block_types=["Image", "Table"],
|
| 61 |
extract_image_block_to_payload=True,
|
| 62 |
-
analysis=
|
| 63 |
analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
|
| 64 |
)
|
| 65 |
text = convert_elements_to_markdown(elements)
|
|
|
|
| 5 |
from unstructured.partition.pdf import partition_pdf
|
| 6 |
from unstructured.partition.pdf_image.analysis import bbox_visualisation
|
| 7 |
|
| 8 |
+
from .settings import ENABLE_DEBUG_MODE
|
| 9 |
+
|
| 10 |
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
|
| 11 |
|
| 12 |
|
|
|
|
| 61 |
# extract_images_in_pdf=True,
|
| 62 |
extract_image_block_types=["Image", "Table"],
|
| 63 |
extract_image_block_to_payload=True,
|
| 64 |
+
analysis=ENABLE_DEBUG_MODE,
|
| 65 |
analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
|
| 66 |
)
|
| 67 |
text = convert_elements_to_markdown(elements)
|