Spaces:
Running
Running
File size: 836 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
from pdf_token_type_labels import TokenType
from ports.services.text_extraction_service import TextExtractionService
from configuration import service_logger
class TextExtractionAdapter(TextExtractionService):
    """TextExtractionService implementation that joins the text of segment boxes."""

    def extract_text_by_types(self, segment_boxes: list[dict], token_types: list[TokenType]) -> str:
        """Return the text of the segment boxes whose type is in ``token_types``.

        Args:
            segment_boxes: Dicts that each provide a ``"text"`` and a ``"type"`` entry.
            token_types: Token types to keep; boxes resolving to any other type are skipped.

        Returns:
            The selected texts, in input order, joined with newlines. The result is a
            plain string (empty when no box matches).
        """
        service_logger.info(f"Extracted types: {[t.name for t in token_types]}")
        # Spaces in a box's type label are replaced with underscores before the
        # label is handed to TokenType.from_text for lookup.
        return "\n".join(
            segment_box["text"]
            for segment_box in segment_boxes
            if TokenType.from_text(segment_box["type"].replace(" ", "_")) in token_types
        )

    def extract_all_text(self, segment_boxes: list[dict]) -> str:
        """Return the newline-joined text of ``segment_boxes``, filtering on every ``TokenType`` member."""
        return self.extract_text_by_types(segment_boxes, list(TokenType))
|