File size: 8,513 Bytes
fcaa164 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import logging
from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder
# Module-level logger, named after this module via `__name__`; used below for
# the debug message at engine start-up and for missing-language warnings.
_log = logging.getLogger(__name__)
class TesseractOcrModel(BaseOcrModel):
    """OCR model backed by Tesseract through the `tesserocr` bindings.

    When enabled, the constructor creates one main `PyTessBaseAPI` reader.
    If the configured language is exactly ``"auto"``, an additional
    orientation/script-detection (OSD) reader is created and per-script
    readers are instantiated lazily during OCR and cached in
    ``script_readers``.
    """

    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        """Set up the Tesseract readers.

        Args:
            enabled: Whether the model should run; when false no Tesseract
                resources are created and `__call__` passes pages through.
            options: Tesseract options; ``lang`` and ``path`` are read here.

        Raises:
            ImportError: If `tesserocr` cannot be imported, its version cannot
                be queried, or it reports no installed languages.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None
        self.osd_reader = None
        # Created before any check can fail so that `__del__` never hits a
        # missing attribute on a disabled or partially constructed instance.
        self.script_readers: dict = {}

        if self.enabled:
            install_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation: "
                "https://ds4sd.github.io/docling/installation/"
            )
            missing_langs_errmsg = (
                "tesserocr is not correctly configured. No language models have been detected. "
                "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
                "You can find more information how to setup other OCR engines in Docling "
                "documentation: "
                "https://ds4sd.github.io/docling/installation/"
            )

            try:
                import tesserocr
            except ImportError as err:
                raise ImportError(install_errmsg) from err

            try:
                tesseract_version = tesserocr.tesseract_version()
            except Exception as err:
                # `Exception` (not a bare except) so that KeyboardInterrupt and
                # SystemExit are not masked as an installation problem.
                raise ImportError(install_errmsg) from err

            _, self._tesserocr_languages = tesserocr.get_languages()
            if not self._tesserocr_languages:
                raise ImportError(missing_langs_errmsg)

            # Initialize the tesseractAPI
            _log.debug("Initializing TesserOCR: %s", tesseract_version)
            lang = "+".join(self.options.lang)

            # Some installations list script models under a "script/" prefix;
            # remember it so detected scripts map onto installed model names.
            has_script_dir = any(
                name.startswith("script/") for name in self._tesserocr_languages
            )
            self.script_prefix = "script/" if has_script_dir else ""

            tesserocr_kwargs = {
                "psm": tesserocr.PSM.AUTO,
                "init": True,
                "oem": tesserocr.OEM.DEFAULT,
            }
            if self.options.path is not None:
                tesserocr_kwargs["path"] = self.options.path

            if lang == "auto":
                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
                # In a dict union the right operand wins, so the OSD-specific
                # settings must be on the right; otherwise the shared
                # "psm": AUTO entry would replace OSD_ONLY.
                osd_kwargs = tesserocr_kwargs | {
                    "lang": "osd",
                    "psm": tesserocr.PSM.OSD_ONLY,
                }
                self.osd_reader = tesserocr.PyTessBaseAPI(**osd_kwargs)
            else:
                self.reader = tesserocr.PyTessBaseAPI(
                    **(tesserocr_kwargs | {"lang": lang})
                )
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        """Finalize every Tesseract API handle owned by this instance."""
        # `getattr` keeps the finalizer safe even if construction stopped
        # before the attributes were assigned.
        for handle in (
            getattr(self, "reader", None),
            getattr(self, "osd_reader", None),
        ):
            if handle is not None:
                # Finalize the tesseractAPI
                handle.End()
        for script_reader in getattr(self, "script_readers", {}).values():
            script_reader.End()

    def _select_reader(self, image):
        """Return the reader to use for `image`, or None to skip the region.

        Without "auto" in the configured languages this is the main reader.
        Otherwise the OSD reader detects the script; None is returned when
        detection yields nothing, the main reader is used when the detected
        language is not installed, and a cached per-script reader otherwise.
        """
        if "auto" not in self.options.lang:
            return self.reader

        assert self.osd_reader is not None
        self.osd_reader.SetImage(image)
        osd = self.osd_reader.DetectOrientationScript()
        # No text, probably
        if osd is None:
            return None

        script = map_tesseract_script(osd["script_name"])
        lang = f"{self.script_prefix}{script}"

        # Check if the detected language is present in the system
        if lang not in self._tesserocr_languages:
            _log.warning(
                "Tesseract detected the script '%s' and language '%s'."
                " However this language is not installed in your system and will be ignored.",
                script,
                lang,
            )
            return self.reader

        if script not in self.script_readers:
            import tesserocr

            assert self.reader is not None
            # Reuse the main reader's data path so the script model is loaded
            # from the same tessdata location.
            self.script_readers[script] = tesserocr.PyTessBaseAPI(
                path=self.reader.GetDatapath(),
                lang=lang,
                psm=tesserocr.PSM.AUTO,
                init=True,
                oem=tesserocr.OEM.DEFAULT,
            )
        return self.script_readers[script]

    def _recognize_lines(self, reader, image) -> list[OcrCell]:
        """Run `reader` on `image` and return one `OcrCell` per text line.

        Box coordinates are divided by ``self.scale`` to undo the render
        scaling applied when the image was produced.
        """
        reader.SetImage(image)
        boxes = reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

        cells = []
        for ix, (_, box, _, _) in enumerate(boxes):
            # Set the area of interest. Tesseract uses Bottom-Left for the origin
            reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

            # Extract text within the bounding box
            text = reader.GetUTF8Text().strip()
            confidence = reader.MeanTextConf()

            # NOTE(review): these coordinates are relative to the cropped
            # image; no offset for the crop's position on the page is added
            # here. Confirm whether the crop origin should be applied.
            left = box["x"] / self.scale
            bottom = box["y"] / self.scale
            right = (box["x"] + box["w"]) / self.scale
            top = (box["y"] + box["h"]) / self.scale

            cells.append(
                OcrCell(
                    id=ix,
                    text=text,
                    confidence=confidence,
                    bbox=BoundingBox.from_tuple(
                        coord=(left, top, right, bottom),
                        origin=CoordOrigin.TOPLEFT,
                    ),
                )
            )
        return cells

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run OCR on each page of `page_batch` and yield the pages in order.

        Pages are yielded unchanged when the model is disabled or the page
        backend is not valid. Otherwise the recognised cells of all OCR
        regions are combined with the existing page cells via
        `post_process_cells` and stored in ``page.cells``.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "ocr"):
                assert self.reader is not None
                assert self._tesserocr_languages is not None

                ocr_rects = self.get_ocr_rects(page)

                all_ocr_cells = []
                for ocr_rect in ocr_rects:
                    # Skip zero area boxes
                    if ocr_rect.area() == 0:
                        continue

                    high_res_image = page._backend.get_page_image(
                        scale=self.scale, cropbox=ocr_rect
                    )

                    local_reader = self._select_reader(high_res_image)
                    if local_reader is None:
                        # Script detection found nothing in this region.
                        continue

                    all_ocr_cells.extend(
                        self._recognize_lines(local_reader, high_res_image)
                    )

                # Post-process the cells
                page.cells = self.post_process_cells(all_ocr_cells, page.cells)

            # DEBUG code:
            if settings.debug.visualize_ocr:
                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

            yield page
|