Wasim
Sync: robust vehicle parser + full project
2e237ce
import tempfile
import uuid
from os.path import join
from pathlib import Path
from typing import AnyStr
from domain.PdfSegment import PdfSegment
from pdf_features import PdfFeatures, Rectangle
from pdf_token_type_labels import TokenType
from ports.services.toc_service import TOCService
from configuration import service_logger
from adapters.infrastructure.toc.TOCExtractor import TOCExtractor
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
# Segment types that are treated as table-of-contents title candidates.
TITLE_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER}
# Segment types that may appear in the leading run of a document while its name is being skipped:
# the title types plus page headers and pictures.
SKIP_TYPES = TITLE_TYPES | {TokenType.PAGE_HEADER, TokenType.PICTURE}
class TOCServiceAdapter(TOCService):
    """TOCService implementation that builds a table of contents from PDF content and layout segment boxes."""

    def extract_table_of_contents(
        self, pdf_content: AnyStr, segment_boxes: list[dict], skip_document_name=False
    ) -> list[dict]:
        """Extract the table of contents of a PDF.

        Args:
            pdf_content: Raw content of the PDF; it is written to a temporary file with ``Path.write_bytes``.
            segment_boxes: One dict per layout segment with the keys ``left``, ``top``, ``width``, ``height``,
                ``type``, ``page_number`` and ``text``.
            skip_document_name: When true, the leading title segments selected by
                ``_skip_name_of_the_document`` are excluded from the table of contents.

        Returns:
            The value of ``TOCExtractor.to_dict()`` for the title segments.
        """
        service_logger.info("Getting TOC")
        pdf_path = self._pdf_content_to_pdf_path(pdf_content)
        try:
            pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
            pdf_segments: list[PdfSegment] = self._get_pdf_segments_from_segment_boxes(pdf_features, segment_boxes)
            title_segments = [segment for segment in pdf_segments if segment.segment_type in TITLE_TYPES]
            if skip_document_name:
                self._skip_name_of_the_document(pdf_segments, title_segments)
            pdf_segmentation: PdfSegmentation = PdfSegmentation(pdf_features, title_segments)
            toc_instance: TOCExtractor = TOCExtractor(pdf_segmentation)
            return toc_instance.to_dict()
        finally:
            # The temporary PDF is only an input of the extraction; remove it so repeated
            # calls do not accumulate files in the temp directory.
            try:
                pdf_path.unlink(missing_ok=True)
            except OSError as error:
                # Best-effort cleanup: never let it mask the result or the original exception.
                service_logger.warning(f"Could not remove temporary file {pdf_path}: {error}")

    def format_toc_for_uwazi(self, toc_items: list[dict]) -> list[dict]:
        """Return copies of ``toc_items`` in the shape expected by Uwazi.

        For every item the ``bounding_box`` entry is replaced by ``selectionRectangles``, a one-element list
        holding the box with ``left``, ``top``, ``width`` and ``height`` divided by 0.75 and truncated to int.
        Other keys of the item and of the box are carried over unchanged.

        The input items and their bounding boxes are not modified.

        Raises:
            KeyError: If an item has no ``bounding_box`` or the box lacks one of the four coordinates.
        """
        # NOTE(review): 0.75 looks like a points-to-pixels ratio (72/96) — confirm.
        scale = 0.75
        toc_compatible = []
        for toc_item in toc_items:
            bounding_box = toc_item["bounding_box"]
            # Build a fresh rectangle instead of rescaling the caller's nested dict in place.
            rectangle = dict(bounding_box)
            for key in ("left", "top", "width", "height"):
                rectangle[key] = int(bounding_box[key] / scale)
            formatted_item = {key: value for key, value in toc_item.items() if key != "bounding_box"}
            formatted_item["selectionRectangles"] = [rectangle]
            toc_compatible.append(formatted_item)
        return toc_compatible

    def _get_file_path(self, file_name: str, extension: str) -> str:
        """Return the path of ``<file_name>.<extension>`` inside the system temp directory."""
        return join(tempfile.gettempdir(), f"{file_name}.{extension}")

    def _pdf_content_to_pdf_path(self, file_content: AnyStr) -> Path:
        """Write ``file_content`` to a uniquely named ``.pdf`` file in the temp directory and return its path.

        The caller is responsible for deleting the file.
        """
        # uuid4 is random; uuid1 would embed the host MAC address and a timestamp in the file name.
        file_id = str(uuid.uuid4())
        pdf_path = Path(self._get_file_path(file_id, "pdf"))
        pdf_path.write_bytes(file_content)
        return pdf_path

    def _skip_name_of_the_document(self, pdf_segments: list[PdfSegment], title_segments: list[PdfSegment]) -> None:
        """Remove the leading title segments of the document from ``title_segments`` in place.

        ``pdf_segments`` is scanned from the start. Page headers and pictures are ignored, and the scan stops
        at the first segment whose type is not in ``SKIP_TYPES``. It also stops at a title segment whose right
        edge lies left of the point 66% across the previous title segment.
        NOTE(review): presumably these leading titles are the document name — confirm.

        Raises:
            ValueError: If a selected segment is not present in ``title_segments``.
        """
        segments_to_remove = []
        last_segment = None
        for segment in pdf_segments:
            if segment.segment_type not in SKIP_TYPES:
                break
            if segment.segment_type in (TokenType.PAGE_HEADER, TokenType.PICTURE):
                continue
            if last_segment is not None:
                last_box = last_segment.bounding_box
                if segment.bounding_box.right < last_box.left + last_box.width * 0.66:
                    break
            last_segment = segment
            if segment.segment_type in TITLE_TYPES:
                segments_to_remove.append(segment)
        for segment in segments_to_remove:
            title_segments.remove(segment)

    def _get_pdf_segments_from_segment_boxes(self, pdf_features: PdfFeatures, segment_boxes: list[dict]) -> list[PdfSegment]:
        """Build one ``PdfSegment`` per segment box, in input order.

        Each box must provide ``left``, ``top``, ``width``, ``height``, ``type``, ``page_number`` and ``text``;
        every segment is tagged with ``pdf_features.file_name``.
        """
        pdf_name = pdf_features.file_name
        pdf_segments: list[PdfSegment] = []
        for segment_box in segment_boxes:
            bounding_box = Rectangle.from_width_height(
                segment_box["left"], segment_box["top"], segment_box["width"], segment_box["height"]
            )
            segment_type = TokenType.from_value(segment_box["type"])
            pdf_segments.append(
                PdfSegment(segment_box["page_number"], bounding_box, segment_box["text"], segment_type, pdf_name)
            )
        return pdf_segments