Spaces:
Running
Running
File size: 4,260 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import tempfile
import uuid
from os.path import join
from pathlib import Path
from typing import AnyStr
from domain.PdfSegment import PdfSegment
from pdf_features import PdfFeatures, Rectangle
from pdf_token_type_labels import TokenType
from ports.services.toc_service import TOCService
from configuration import service_logger
from adapters.infrastructure.toc.TOCExtractor import TOCExtractor
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
# Segment types treated as headings: only segments of these types are passed on as
# title candidates when the table of contents is built.
TITLE_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER}
# Segment types allowed in the leading run of segments that _skip_name_of_the_document
# scans; the first segment whose type is outside this set ends that scan.
SKIP_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER, TokenType.PAGE_HEADER, TokenType.PICTURE}
class TOCServiceAdapter(TOCService):
    """TOCService implementation that builds a table of contents from a PDF and its segment boxes."""

    # Divisor applied to bounding-box coordinates when formatting for Uwazi.
    # NOTE(review): presumably converts PDF points (72 dpi) to CSS pixels (96 dpi) - confirm.
    _UWAZI_COORDINATE_SCALE = 0.75
    _BOUNDING_BOX_KEYS = ("left", "top", "width", "height")

    def extract_table_of_contents(
        self, pdf_content: AnyStr, segment_boxes: list[dict], skip_document_name=False
    ) -> list[dict]:
        """Return the table of contents of a PDF as a list of dicts.

        Args:
            pdf_content: Raw PDF content; it is written to a temporary file for parsing.
            segment_boxes: Segment descriptions with the keys "left", "top", "width",
                "height", "type", "page_number" and "text".
            skip_document_name: When true, the leading title segments (see
                ``_skip_name_of_the_document``) are excluded from the TOC candidates.

        Returns:
            Whatever ``TOCExtractor.to_dict()`` produces for the title segments.
        """
        service_logger.info("Getting TOC")
        pdf_path = self._pdf_content_to_pdf_path(pdf_content)
        try:
            pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
            pdf_segments: list[PdfSegment] = self._get_pdf_segments_from_segment_boxes(pdf_features, segment_boxes)
            title_segments = [segment for segment in pdf_segments if segment.segment_type in TITLE_TYPES]
            if skip_document_name:
                self._skip_name_of_the_document(pdf_segments, title_segments)
            pdf_segmentation: PdfSegmentation = PdfSegmentation(pdf_features, title_segments)
            toc_instance: TOCExtractor = TOCExtractor(pdf_segmentation)
            return toc_instance.to_dict()
        finally:
            # The temporary PDF is only needed while the TOC is being computed; without
            # this cleanup every call would leave one file behind in the temp directory.
            try:
                pdf_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: never let it mask the result or the original error.
                service_logger.info("Could not remove temporary PDF file")

    def format_toc_for_uwazi(self, toc_items: list[dict]) -> list[dict]:
        """Convert TOC items to the shape expected by Uwazi.

        Each returned item is a copy of the input item in which the "bounding_box" entry
        is replaced by "selectionRectangles": a one-element list holding the bounding box
        with "left", "top", "width" and "height" divided by 0.75 and truncated to int.

        The input items and their nested bounding boxes are left unmodified.
        """
        toc_compatible = []
        for toc_item in toc_items:
            # Copy the nested dict: a shallow copy of toc_item alone would share it with
            # the caller's data and the rescaling below would corrupt the input.
            rectangle = dict(toc_item["bounding_box"])
            for key in self._BOUNDING_BOX_KEYS:
                rectangle[key] = int(rectangle[key] / self._UWAZI_COORDINATE_SCALE)

            uwazi_item = {key: value for key, value in toc_item.items() if key != "bounding_box"}
            uwazi_item["selectionRectangles"] = [rectangle]
            toc_compatible.append(uwazi_item)
        return toc_compatible

    def _get_file_path(self, file_name: str, extension: str) -> str:
        """Return the path of ``<file_name>.<extension>`` inside the system temp directory."""
        return join(tempfile.gettempdir(), file_name + "." + extension)

    def _pdf_content_to_pdf_path(self, file_content: AnyStr) -> Path:
        """Write the PDF content to a uniquely named temporary file and return its path.

        NOTE(review): ``Path.write_bytes`` requires a bytes-like object, so a ``str``
        argument fails here despite the ``AnyStr`` annotation.
        """
        file_id = str(uuid.uuid1())
        pdf_path = Path(self._get_file_path(file_id, "pdf"))
        pdf_path.write_bytes(file_content)
        return pdf_path

    def _skip_name_of_the_document(self, pdf_segments: list[PdfSegment], title_segments: list[PdfSegment]) -> None:
        """Remove the leading run of title segments from ``title_segments`` in place.

        Segments are scanned in order. Page headers and pictures are ignored; the scan
        ends at the first segment whose type is not in ``SKIP_TYPES``, or at the first
        title whose right edge lies left of the point 66% across the previous title's
        width. Every title seen before the scan ends is removed from ``title_segments``.

        NOTE(review): the geometric test presumably separates a multi-line document
        name from the first real heading - confirm the intent.
        """
        segments_to_remove = []
        last_segment = None
        for segment in pdf_segments:
            if segment.segment_type not in SKIP_TYPES:
                break
            if segment.segment_type == TokenType.PAGE_HEADER or segment.segment_type == TokenType.PICTURE:
                continue

            if last_segment is not None:
                previous_box = last_segment.bounding_box
                if segment.bounding_box.right < previous_box.left + previous_box.width * 0.66:
                    break
            last_segment = segment

            if segment.segment_type in TITLE_TYPES:
                segments_to_remove.append(segment)

        for segment in segments_to_remove:
            title_segments.remove(segment)

    def _get_pdf_segments_from_segment_boxes(self, pdf_features: PdfFeatures, segment_boxes: list[dict]) -> list[PdfSegment]:
        """Build one ``PdfSegment`` per segment box, tagged with the file name of ``pdf_features``."""
        pdf_segments: list[PdfSegment] = []
        for segment_box in segment_boxes:
            bounding_box = Rectangle.from_width_height(
                segment_box["left"], segment_box["top"], segment_box["width"], segment_box["height"]
            )
            segment_type = TokenType.from_value(segment_box["type"])
            pdf_segments.append(
                PdfSegment(
                    segment_box["page_number"],
                    bounding_box,
                    segment_box["text"],
                    segment_type,
                    pdf_features.file_name,
                )
            )
        return pdf_segments
|