PDF-Data_Extractor / src /adapters /infrastructure /pdf_analysis_service_adapter.py
Wasim
Sync: robust vehicle parser + full project
2e237ce
from typing import AnyStr
from domain.PdfImages import PdfImages
from domain.SegmentBox import SegmentBox
from ports.services.pdf_analysis_service import PDFAnalysisService
from ports.services.ml_model_service import MLModelService
from ports.services.format_conversion_service import FormatConversionService
from ports.repositories.file_repository import FileRepository
from configuration import service_logger
class PDFAnalysisServiceAdapter(PDFAnalysisService):
    """Adapter that implements ``PDFAnalysisService`` by composing injected services.

    Each analysis call saves the incoming PDF through the file repository,
    builds ``PdfImages`` from the saved path, asks one of the two model
    services for layout segments, optionally runs the formula/table
    conversions, and returns the segments serialized as dictionaries.
    """

    def __init__(
        self,
        vgt_model_service: MLModelService,
        fast_model_service: MLModelService,
        format_conversion_service: FormatConversionService,
        file_repository: FileRepository,
    ):
        """Store the collaborating services.

        Args:
            vgt_model_service: Model service used by ``analyze_pdf_layout``
                (its ``predict_document_layout`` method is called).
            fast_model_service: Model service used by ``analyze_pdf_layout_fast``
                (its ``predict_layout_fast`` method is called).
            format_conversion_service: Service providing
                ``convert_formula_to_latex`` and ``convert_table_to_html``.
            file_repository: Repository used to save and delete the PDF file.
        """
        self.vgt_model_service = vgt_model_service
        self.fast_model_service = fast_model_service
        self.format_conversion_service = format_conversion_service
        self.file_repository = file_repository

    def analyze_pdf_layout(
        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
    ) -> list[dict]:
        """Analyze the layout of a PDF with ``vgt_model_service``.

        Args:
            pdf_content: PDF content handed to ``file_repository.save_pdf``.
            xml_filename: Forwarded to ``PdfImages.from_pdf_path``.
            parse_tables_and_math: When true, the PDF is additionally loaded
                at ``dpi=200`` and passed, together with the predicted
                segments, to the formula and table conversions.
            keep_pdf: When false, the saved PDF is deleted before returning,
                including when processing fails with an exception.

        Returns:
            One dictionary per predicted segment, produced by
            ``SegmentBox.from_pdf_segment(...).to_dict()``.
        """
        pdf_path = self.file_repository.save_pdf(pdf_content)
        try:
            service_logger.info("Creating PDF images")
            pdf_images_list: list[PdfImages] = [PdfImages.from_pdf_path(pdf_path, "", xml_filename)]

            predicted_segments = self.vgt_model_service.predict_document_layout(pdf_images_list)

            if parse_tables_and_math:
                pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
                # Return values are ignored, so any effect of these calls must
                # come from updating the segments in place (assumption - confirm).
                self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
                self.format_conversion_service.convert_table_to_html(pdf_images_200_dpi, predicted_segments)
        finally:
            # Clean up even when a step above raises; otherwise every failed
            # request would leave its saved PDF behind.
            if not keep_pdf:
                self.file_repository.delete_file(pdf_path)

        return [
            SegmentBox.from_pdf_segment(pdf_segment, pdf_images_list[0].pdf_features.pages).to_dict()
            for pdf_segment in predicted_segments
        ]

    def analyze_pdf_layout_fast(
        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
    ) -> list[dict]:
        """Analyze the layout of a PDF with ``fast_model_service``.

        Args:
            pdf_content: PDF content handed to ``file_repository.save_pdf``.
            xml_filename: Forwarded to ``PdfImages.from_pdf_path``.
            parse_tables_and_math: When true, the formula and table
                conversions are run on the predicted segments.
            keep_pdf: When false, the saved PDF is deleted before returning,
                including when processing fails with an exception.

        Returns:
            One dictionary per predicted segment, produced by
            ``SegmentBox.from_pdf_segment(...).to_dict()``.
        """
        pdf_path = self.file_repository.save_pdf(pdf_content)
        try:
            service_logger.info("Creating PDF images for fast analysis")
            pdf_images_list: list[PdfImages] = [PdfImages.from_pdf_path(pdf_path, "", xml_filename)]

            predicted_segments = self.fast_model_service.predict_layout_fast(pdf_images_list)

            if parse_tables_and_math:
                pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
                self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
                # NOTE(review): unlike analyze_pdf_layout, the table conversion
                # here receives the default-resolution images rather than
                # pdf_images_200_dpi. Behaviour is preserved as found; confirm
                # whether this difference is intentional.
                self.format_conversion_service.convert_table_to_html(pdf_images_list[0], predicted_segments)
        finally:
            # Clean up even when a step above raises; otherwise every failed
            # request would leave its saved PDF behind.
            if not keep_pdf:
                self.file_repository.delete_file(pdf_path)

        return [
            SegmentBox.from_pdf_segment(pdf_segment, pdf_images_list[0].pdf_features.pages).to_dict()
            for pdf_segment in predicted_segments
        ]