from typing import AnyStr

from domain.PdfImages import PdfImages
from domain.SegmentBox import SegmentBox
from ports.services.pdf_analysis_service import PDFAnalysisService
from ports.services.ml_model_service import MLModelService
from ports.services.format_conversion_service import FormatConversionService
from ports.repositories.file_repository import FileRepository
from configuration import service_logger


class PDFAnalysisServiceAdapter(PDFAnalysisService):
    """PDFAnalysisService implementation built from injected collaborators.

    Each analysis saves the incoming PDF through the file repository, builds
    ``PdfImages`` for it, asks one of the two model services for layout
    segments, optionally runs formula/table conversion on those segments and
    returns the segments serialised as dictionaries.
    """

    def __init__(
        self,
        vgt_model_service: MLModelService,
        fast_model_service: MLModelService,
        format_conversion_service: FormatConversionService,
        file_repository: FileRepository,
    ):
        """Store the collaborators used by the analysis methods.

        Args:
            vgt_model_service: Model service used by ``analyze_pdf_layout``.
            fast_model_service: Model service used by ``analyze_pdf_layout_fast``.
            format_conversion_service: Service providing the formula-to-LaTeX
                and table-to-HTML conversions.
            file_repository: Repository used to save and delete the PDF file.
        """
        self.vgt_model_service = vgt_model_service
        self.fast_model_service = fast_model_service
        self.format_conversion_service = format_conversion_service
        self.file_repository = file_repository

    def analyze_pdf_layout(
        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
    ) -> list[dict]:
        """Analyze a PDF with ``vgt_model_service`` and return its segments.

        Args:
            pdf_content: PDF payload handed to ``file_repository.save_pdf``.
            xml_filename: Forwarded unchanged to ``PdfImages.from_pdf_path``.
            parse_tables_and_math: When true, run the formula and table
                conversions using images rendered with ``dpi=200``.
            keep_pdf: When false, the saved PDF is deleted before returning,
                including when the analysis raises.

        Returns:
            One ``SegmentBox.to_dict()`` result per predicted segment.
        """
        pdf_path = self.file_repository.save_pdf(pdf_content)
        try:
            service_logger.info("Creating PDF images")
            pdf_images_list: list[PdfImages] = [PdfImages.from_pdf_path(pdf_path, "", xml_filename)]
            predicted_segments = self.vgt_model_service.predict_document_layout(pdf_images_list)

            if parse_tables_and_math:
                # The converters' return values are not used here; they are
                # called with the segment list that is serialised below.
                pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
                self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
                self.format_conversion_service.convert_table_to_html(pdf_images_200_dpi, predicted_segments)
        finally:
            # Clean up on failure too, so a failed analysis does not leave the
            # saved PDF behind.
            if not keep_pdf:
                self.file_repository.delete_file(pdf_path)

        return self._to_segment_dicts(predicted_segments, pdf_images_list[0])

    def analyze_pdf_layout_fast(
        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
    ) -> list[dict]:
        """Analyze a PDF with ``fast_model_service`` and return its segments.

        Args:
            pdf_content: PDF payload handed to ``file_repository.save_pdf``.
            xml_filename: Forwarded unchanged to ``PdfImages.from_pdf_path``.
            parse_tables_and_math: When true, run the formula conversion on
                images rendered with ``dpi=200`` and the table conversion on
                the default-resolution images.
            keep_pdf: When false, the saved PDF is deleted before returning,
                including when the analysis raises.

        Returns:
            One ``SegmentBox.to_dict()`` result per predicted segment.
        """
        pdf_path = self.file_repository.save_pdf(pdf_content)
        try:
            service_logger.info("Creating PDF images for fast analysis")
            pdf_images_list: list[PdfImages] = [PdfImages.from_pdf_path(pdf_path, "", xml_filename)]
            predicted_segments = self.fast_model_service.predict_layout_fast(pdf_images_list)

            if parse_tables_and_math:
                pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
                self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
                # NOTE(review): analyze_pdf_layout passes the 200-DPI images to
                # convert_table_to_html, while this path passes the
                # default-resolution images. Behaviour kept as-is; confirm
                # whether the difference is intentional.
                self.format_conversion_service.convert_table_to_html(pdf_images_list[0], predicted_segments)
        finally:
            # Clean up on failure too, so a failed analysis does not leave the
            # saved PDF behind.
            if not keep_pdf:
                self.file_repository.delete_file(pdf_path)

        return self._to_segment_dicts(predicted_segments, pdf_images_list[0])

    @staticmethod
    def _to_segment_dicts(predicted_segments, pdf_images: PdfImages) -> list[dict]:
        """Serialise segments to dicts using the pages of ``pdf_images``."""
        pages = pdf_images.pdf_features.pages
        return [SegmentBox.from_pdf_segment(pdf_segment, pages).to_dict() for pdf_segment in predicted_segments]