Spaces:
Running
Running
| from pdf_token_type_labels import TokenType | |
| from ports.services.text_extraction_service import TextExtractionService | |
| from configuration import service_logger | |
class TextExtractionAdapter(TextExtractionService):
    """Text extraction service that builds plain text from segment box dictionaries."""

    def extract_text_by_types(self, segment_boxes: list[dict], token_types: list[TokenType]) -> str:
        """Return the text of every segment whose type is among ``token_types``.

        Args:
            segment_boxes: Segment dictionaries. Each one is read through its
                ``"text"`` and ``"type"`` keys.
            token_types: Token types whose segments are kept.

        Returns:
            The ``"text"`` values of the matching segments, in input order,
            joined with newlines. An empty string when nothing matches.
        """
        service_logger.info(f"Extracted types: {[t.name for t in token_types]}")
        # Spaces in a segment's type label are replaced with underscores before
        # the label is handed to TokenType.from_text; how labels that match no
        # member are treated is decided by TokenType.from_text, not here.
        return "\n".join(
            segment_box["text"]
            for segment_box in segment_boxes
            if TokenType.from_text(segment_box["type"].replace(" ", "_")) in token_types
        )

    def extract_all_text(self, segment_boxes: list[dict]) -> str:
        """Return the newline-joined text of the segments, selecting every ``TokenType`` member.

        Args:
            segment_boxes: Segment dictionaries, as accepted by
                ``extract_text_by_types``.

        Returns:
            The result of ``extract_text_by_types`` called with all members of
            ``TokenType``.
        """
        return self.extract_text_by_types(segment_boxes, list(TokenType))