import tempfile
import uuid
from os.path import join
from pathlib import Path
from typing import AnyStr

from domain.PdfSegment import PdfSegment
from pdf_features import PdfFeatures, Rectangle
from pdf_token_type_labels import TokenType
from ports.services.toc_service import TOCService
from configuration import service_logger
from adapters.infrastructure.toc.TOCExtractor import TOCExtractor
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation

# Segment types that are kept as table-of-contents title candidates.
TITLE_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER}
# Segment types that may appear in the leading run scanned by _skip_name_of_the_document.
SKIP_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER, TokenType.PAGE_HEADER, TokenType.PICTURE}

# Divisor applied to bounding-box coordinates in format_toc_for_uwazi.
# NOTE(review): presumably converts PDF points to 96-DPI pixels — confirm with the consumer.
_UWAZI_COORDINATE_SCALE = 0.75
_SCALED_BOX_KEYS = ("left", "top", "width", "height")


class TOCServiceAdapter(TOCService):
    """TOCService implementation built on PdfFeatures, PdfSegmentation and TOCExtractor."""

    def extract_table_of_contents(
        self, pdf_content: AnyStr, segment_boxes: list[dict], skip_document_name=False
    ) -> list[dict]:
        """Build the table of contents for a PDF from pre-computed segment boxes.

        Args:
            pdf_content: Raw PDF content; it is written to a temporary file with
                ``Path.write_bytes``, so it must be bytes-like.
            segment_boxes: One dict per segment with the keys "left", "top", "width",
                "height", "type", "page_number" and "text".
            skip_document_name: When true, leading title segments selected by
                ``_skip_name_of_the_document`` are excluded from the TOC.

        Returns:
            The result of ``TOCExtractor.to_dict()``.
        """
        service_logger.info("Getting TOC")
        pdf_path = self._pdf_content_to_pdf_path(pdf_content)
        try:
            pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
            pdf_segments: list[PdfSegment] = self._get_pdf_segments_from_segment_boxes(pdf_features, segment_boxes)
            title_segments = [segment for segment in pdf_segments if segment.segment_type in TITLE_TYPES]
            if skip_document_name:
                self._skip_name_of_the_document(pdf_segments, title_segments)
            pdf_segmentation: PdfSegmentation = PdfSegmentation(pdf_features, title_segments)
            toc_instance: TOCExtractor = TOCExtractor(pdf_segmentation)
            return toc_instance.to_dict()
        finally:
            # The temporary PDF is only needed while the TOC is computed; remove it so
            # repeated calls do not accumulate files in the temp directory.
            pdf_path.unlink(missing_ok=True)

    def format_toc_for_uwazi(self, toc_items: list[dict]) -> list[dict]:
        """Return copies of *toc_items* reshaped for Uwazi.

        Each item's "bounding_box" is replaced by a "selectionRectangles" list holding
        one rectangle whose "left", "top", "width" and "height" are divided by 0.75 and
        truncated to ``int``. Any other keys in the item or the rectangle are carried
        over unchanged. The input items and their bounding boxes are not modified.
        """
        toc_compatible = []
        for toc_item in toc_items:
            # Copy the rectangle before scaling: the item copy is shallow, so writing
            # through it would rescale the caller's own bounding box.
            rectangle = dict(toc_item["bounding_box"])
            for key in _SCALED_BOX_KEYS:
                rectangle[key] = int(rectangle[key] / _UWAZI_COORDINATE_SCALE)

            uwazi_item = {key: value for key, value in toc_item.items() if key != "bounding_box"}
            uwazi_item["selectionRectangles"] = [rectangle]
            toc_compatible.append(uwazi_item)
        return toc_compatible

    def _get_file_path(self, file_name: str, extension: str) -> str:
        """Return ``<system temp dir>/<file_name>.<extension>`` as a string."""
        return join(tempfile.gettempdir(), file_name + "." + extension)

    def _pdf_content_to_pdf_path(self, file_content: AnyStr) -> Path:
        """Write *file_content* to a uniquely named ``.pdf`` in the temp dir and return its path.

        The caller owns the file and is responsible for deleting it.
        """
        file_id = str(uuid.uuid1())
        pdf_path = Path(self._get_file_path(file_id, "pdf"))
        pdf_path.write_bytes(file_content)
        return pdf_path

    def _skip_name_of_the_document(self, pdf_segments: list[PdfSegment], title_segments: list[PdfSegment]) -> None:
        """Remove the leading run of title segments from *title_segments* in place.

        Scans *pdf_segments* from the start and stops at the first segment whose type
        is not in ``SKIP_TYPES``. Page headers and pictures are ignored. The scan also
        stops when a segment's right edge lies left of the point 66% across the
        previously kept segment. Every title/section-header segment visited before
        stopping is removed from *title_segments*.
        """
        segments_to_remove = []
        last_segment = None
        for segment in pdf_segments:
            if segment.segment_type not in SKIP_TYPES:
                break
            if segment.segment_type == TokenType.PAGE_HEADER or segment.segment_type == TokenType.PICTURE:
                continue
            if last_segment is not None:
                previous_box = last_segment.bounding_box
                # NOTE(review): 0.66 looks like a horizontal-alignment heuristic for
                # "still part of the same heading run" — confirm the intent.
                if segment.bounding_box.right < previous_box.left + previous_box.width * 0.66:
                    break
            last_segment = segment
            if segment.segment_type in TITLE_TYPES:
                segments_to_remove.append(segment)

        for segment in segments_to_remove:
            title_segments.remove(segment)

    def _get_pdf_segments_from_segment_boxes(self, pdf_features: PdfFeatures, segment_boxes: list[dict]) -> list[PdfSegment]:
        """Convert segment-box dicts into ``PdfSegment`` objects, preserving order.

        Each box supplies "left", "top", "width", "height", "type", "page_number" and
        "text"; the PDF name is taken from ``pdf_features.file_name``.
        """
        pdf_segments: list[PdfSegment] = []
        for segment_box in segment_boxes:
            bounding_box = Rectangle.from_width_height(
                segment_box["left"], segment_box["top"], segment_box["width"], segment_box["height"]
            )
            segment_type = TokenType.from_value(segment_box["type"])
            pdf_segments.append(
                PdfSegment(
                    segment_box["page_number"],
                    bounding_box,
                    segment_box["text"],
                    segment_type,
                    pdf_features.file_name,
                )
            )
        return pdf_segments