Spaces:

tensorlake
/

document-extractors

Sleeping

App Files Files Community

rishiraj committed on Jun 7, 2024

Commit

5f95db3

verified ·

1 Parent(s): 558c701

Delete pdf-extractor

Browse files

Files changed (2) hide show

pdf-extractor/pdf_extractor.py +0 -52
pdf-extractor/utils/tt_module.py +0 -230

pdf-extractor/pdf_extractor.py DELETED Viewed

@@ -1,52 +0,0 @@
-from typing import List, Union, Optional
-import json
-from indexify_extractor_sdk import Content, Extractor, Feature
-from pydantic import BaseModel, Field
-from .utils.tt_module import get_tables
-import fitz
-import tempfile
class PDFExtractorConfig(BaseModel):
    # Options accepted by PDFExtractor.extract.
    # (Documented with comments rather than a docstring so the model's
    # generated schema stays exactly as it was.)

    # Kinds of content to emit. extract() checks this list for the values
    # "text", "image" and "table"; all three are enabled by default.
    output_types: List[str] = Field(
        default_factory=lambda: ["text", "image", "table"],
    )
class PDFExtractor(Extractor):
    """Extractor that emits per-page text, embedded images and tables of a PDF.

    Which of the three kinds of content are produced is controlled by
    ``PDFExtractorConfig.output_types``.
    """

    name = "tensorlake/pdf-extractor"
    description = "PDF Extractor for Texts, Images & Tables"
    system_dependencies = ["poppler-utils"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: PDFExtractorConfig) -> List[Union[Feature, Content]]:
        """Extract the requested content types from one PDF.

        Args:
            content: Input whose ``data`` holds the raw PDF bytes.
            params: Selects the output kinds via ``output_types``.

        Returns:
            A list of ``Content`` items, each carrying one metadata feature
            with a ``"type"`` (``"text"``, ``"image"`` or ``"table"``) and the
            1-based ``"page"`` it came from. Text is emitted once per page,
            images as ``image/png`` and tables as ``application/json``.
        """
        wanted = params.output_types
        contents = []

        # Open the PDF straight from memory: no temporary file is left behind
        # on disk, and the context manager closes the document when done.
        with fitz.open(stream=content.data, filetype="pdf") as doc:
            for page_number, page in enumerate(doc, start=1):
                if "text" in wanted:
                    feature = Feature.metadata(value={"type": "text", "page": page_number})
                    contents.append(Content.from_text(page.get_text(), features=[feature]))

                if "image" in wanted:
                    for img in page.get_images():
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)
                        # Anything that is not already gray or RGB is
                        # converted to RGB before being serialised as PNG.
                        if pix.colorspace.name not in (fitz.csGRAY.name, fitz.csRGB.name):
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        feature = Feature.metadata(value={"type": "image", "page": page_number})
                        contents.append(
                            Content(content_type="image/png", data=pix.tobytes("png"), features=[feature])
                        )

        if "table" in wanted:
            for table_key, table in get_tables(content.data).items():
                # Keys look like "<page>_<table>". int() accepts underscores
                # inside numeric strings, so int("1_2") would silently yield
                # 12; take the page component explicitly instead.
                page_number = int(table_key.split("_", 1)[0])
                feature = Feature.metadata(value={"type": "table", "page": page_number})
                contents.append(
                    Content(content_type="application/json", data=json.dumps(table), features=[feature])
                )

        return contents

pdf-extractor/utils/tt_module.py DELETED Viewed

@@ -1,230 +0,0 @@
-from transformers import AutoModelForObjectDetection
-import torch
-from pdf2image import convert_from_bytes
-from torchvision import transforms
-from transformers import TableTransformerForObjectDetection
-import numpy as np
-import easyocr
-from tqdm.auto import tqdm
# Module-level singletons: both models and the OCR reader are loaded once at
# import time and shared by every call to get_tables / apply_ocr.

# Detector that locates tables on a rendered page image.
model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Model that recognises rows and columns inside a cropped table image.
structure_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-structure-recognition-v1.1-all")
structure_model.to(device)
# OCR reader configured for English, used to read individual table cells.
reader = easyocr.Reader(['en'])
def pdf_to_img(pdf_path):
    """Render every page of a PDF as an RGB image.

    Args:
        pdf_path: Passed unchanged to ``convert_from_bytes``, so despite the
            name it presumably holds the raw PDF bytes rather than a path.

    Returns:
        A list with one RGB-converted image per item returned by
        ``convert_from_bytes``, in the same order.
    """
    return [rendered.convert("RGB") for rendered in convert_from_bytes(pdf_path)]
class MaxResize:
    """Callable transform that rescales an image so its longer side equals ``max_size``.

    The aspect ratio is kept. Images smaller than ``max_size`` are scaled up,
    larger ones are scaled down.
    """

    def __init__(self, max_size=800):
        self.max_size = max_size

    def __call__(self, image):
        """Return ``image`` resized so that ``max(width, height) == max_size``.

        Args:
            image: Object exposing ``size`` as ``(width, height)`` and a
                ``resize((width, height))`` method.

        Returns:
            Whatever ``image.resize`` returns for the scaled dimensions.
        """
        width, height = image.size
        scale = self.max_size / max(width, height)
        # Clamp to one pixel: for very elongated images the short side would
        # otherwise round down to 0, which is not a valid image dimension.
        new_width = max(1, int(round(scale * width)))
        new_height = max(1, int(round(scale * height)))
        return image.resize((new_width, new_height))
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to (x0, y0, x1, y1).

    Args:
        x: Tensor whose last dimension has size 4 and holds
            ``(center_x, center_y, width, height)``.

    Returns:
        Tensor of the same shape holding ``(x0, y0, x1, y1)`` corners.
    """
    x_c, y_c, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    # Stack along the last axis so a single box of shape (4,) and batched
    # inputs work too; for the usual (N, 4) input this equals dim=1.
    return torch.stack([x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h], dim=-1)
def rescale_bboxes(out_bbox, size):
    """Convert center-format boxes to corner format and scale them by an image size.

    Args:
        out_bbox: Boxes as ``(center_x, center_y, width, height)``; presumably
            normalised to the [0, 1] range by the model (not checked here).
        size: ``(width, height)`` of the target image.

    Returns:
        Tensor of ``(x0, y0, x1, y1)`` boxes with x values multiplied by the
        width and y values multiplied by the height.
    """
    width, height = size
    scale = torch.tensor([width, height, width, height], dtype=torch.float32)
    return box_cxcywh_to_xyxy(out_bbox) * scale
def outputs_to_objects(outputs, img_size, id2label):
    """Turn raw detection outputs for the first batch item into labelled boxes.

    Args:
        outputs: Model output exposing ``logits`` and ``pred_boxes``.
        img_size: ``(width, height)`` used to scale the predicted boxes.
        id2label: Mapping from class index to label; predictions whose label
            is ``'no object'`` are discarded.

    Returns:
        A list of dicts with keys ``'label'``, ``'score'`` (float) and
        ``'bbox'`` (list of four floats, ``x0, y0, x1, y1``).
    """
    # Highest-probability class and its probability for every prediction.
    best = outputs.logits.softmax(-1).max(-1)
    labels = best.indices.detach().cpu().numpy()[0]
    scores = best.values.detach().cpu().numpy()[0]
    boxes = rescale_bboxes(outputs['pred_boxes'].detach().cpu()[0], img_size)

    detections = []
    for label, score, box in zip(labels, scores, boxes):
        class_label = id2label[int(label)]
        if class_label == 'no object':
            continue
        detections.append({
            'label': class_label,
            'score': float(score),
            'bbox': [float(coord) for coord in box.tolist()],
        })
    return detections
def _iob(bbox1, bbox2):
    """Return the fraction of ``bbox1``'s area that lies inside ``bbox2``."""
    area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if area <= 0:
        return 0
    overlap_w = min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0])
    overlap_h = min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1])
    if overlap_w <= 0 or overlap_h <= 0:
        return 0
    return (overlap_w * overlap_h) / area


def objects_to_crops(img, tokens, objects, class_thresholds, padding=10):
    """
    Process the bounding boxes produced by the table detection model into
    cropped table images and cropped tokens.

    Args:
        img: Page image exposing ``crop(box)``; the crop must expose
            ``rotate(angle, expand=...)`` and ``size``.
        tokens: Dicts with a ``'bbox'`` entry (``x0, y0, x1, y1``) in page
            coordinates. They are not modified; copies are returned.
        objects: Detections with ``'label'``, ``'score'`` and ``'bbox'``.
        class_thresholds: Minimum score per label; detections scoring below
            the threshold of their label are skipped.
        padding: Margin added on every side of a detection before cropping.

    Returns:
        A list of dicts with ``'image'`` (the cropped, possibly rotated table)
        and ``'tokens'`` (tokens at least half inside the crop, with boxes
        expressed in crop coordinates).
    """
    table_crops = []
    for obj in objects:
        if obj['score'] < class_thresholds[obj['label']]:
            continue

        x0, y0, x1, y1 = obj['bbox']
        crop_box = [x0 - padding, y0 - padding, x1 + padding, y1 + padding]
        cropped_img = img.crop(crop_box)

        # Copy each selected token before shifting it into crop coordinates so
        # that a token overlapping several tables is not shifted repeatedly
        # and the caller's list is left untouched.
        table_tokens = []
        for token in tokens:
            if _iob(token['bbox'], crop_box) < 0.5:
                continue
            tx0, ty0, tx1, ty1 = token['bbox']
            shifted = dict(token)
            shifted['bbox'] = [tx0 - crop_box[0], ty0 - crop_box[1],
                               tx1 - crop_box[0], ty1 - crop_box[1]]
            table_tokens.append(shifted)

        # If table is predicted to be rotated, rotate cropped image and tokens/words:
        if obj['label'] == 'table rotated':
            cropped_img = cropped_img.rotate(270, expand=True)
            rotated_width = cropped_img.size[0]
            for token in table_tokens:
                tx0, ty0, tx1, ty1 = token['bbox']
                token['bbox'] = [rotated_width - ty1 - 1, tx0,
                                 rotated_width - ty0 - 1, tx1]

        table_crops.append({'image': cropped_img, 'tokens': table_tokens})
    return table_crops
def get_cell_coordinates_by_row(table_data):
    """Build a row-by-row grid of cell boxes from detected rows and columns.

    Args:
        table_data: Detections with ``'label'`` and ``'bbox'``
            (``x0, y0, x1, y1``). Only ``'table row'`` and ``'table column'``
            entries are used.

    Returns:
        One dict per row, ordered top to bottom, with keys ``'row'`` (the row
        box), ``'cells'`` (left-to-right list of ``{'column', 'cell'}`` dicts)
        and ``'cell_count'``.
    """
    def boxes_with_label(label, axis):
        # Entries carrying the given label, ordered by one bbox coordinate.
        matching = [entry for entry in table_data if entry['label'] == label]
        return sorted(matching, key=lambda entry: entry['bbox'][axis])

    rows = boxes_with_label('table row', 1)
    columns = boxes_with_label('table column', 0)

    # A cell takes its horizontal extent from the column and its vertical
    # extent from the row. Rows and columns are already ordered, so the grid
    # comes out top-to-bottom and left-to-right.
    return [
        {
            'row': row['bbox'],
            'cells': [
                {
                    'column': column['bbox'],
                    'cell': [column['bbox'][0], row['bbox'][1],
                             column['bbox'][2], row['bbox'][3]],
                }
                for column in columns
            ],
            'cell_count': len(columns),
        }
        for row in rows
    ]
def apply_ocr(cell_coordinates, cropped_table):
    """Run OCR on every cell of a table and return the text row by row.

    Args:
        cell_coordinates: Rows as produced by ``get_cell_coordinates_by_row``;
            each row's ``"cells"`` entries carry a ``"cell"`` box.
        cropped_table: Table image exposing ``crop(box)``.

    Returns:
        A dict mapping the row index to a list with one string per cell. All
        rows have the same length; missing entries are padded with ``""``.
    """
    data = {}
    for idx, row in enumerate(tqdm(cell_coordinates)):
        row_text = []
        for cell in row["cells"]:
            # crop cell out of image
            cell_image = np.array(cropped_table.crop(cell["cell"]))
            # apply OCR
            result = reader.readtext(cell_image)
            # Always append, using "" when nothing was recognised, so that an
            # empty cell keeps its position instead of shifting the
            # following cells of the row one column to the left.
            row_text.append(" ".join(x[1] for x in result))
        data[idx] = row_text

    max_num_columns = max((len(row_text) for row_text in data.values()), default=0)
    print("Max number of columns:", max_num_columns)

    # pad rows which don't have max_num_columns elements
    # to make sure all rows have the same number of columns
    for row_text in data.values():
        row_text.extend([""] * (max_num_columns - len(row_text)))
    return data
def get_tables(pdf_path):
    """Detect the tables on every page of a PDF and OCR their cells.

    Args:
        pdf_path: Passed unchanged to ``pdf_to_img``; despite the name it
            presumably holds the raw PDF bytes rather than a path.

    Returns:
        A dict keyed ``"<page>_<table>"`` (both 1-based) whose values are the
        row dicts returned by ``apply_ocr``. A table is only included when the
        first structure element found in it scores above 0.95.
    """
    # These do not depend on the page, so build them once up front.
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    detection_transform = transforms.Compose([MaxResize(800), transforms.ToTensor(), normalize])
    structure_transform = transforms.Compose([MaxResize(1000), transforms.ToTensor(), normalize])

    # Work on copies of the label maps: adding "no object" to the models' own
    # config dicts would grow them by one entry on every page and every crop.
    id2label = dict(model.config.id2label)
    id2label[len(id2label)] = "no object"
    structure_id2label = dict(structure_model.config.id2label)
    structure_id2label[len(structure_id2label)] = "no object"

    detection_class_thresholds = {
        "table": 0.5,
        "table rotated": 0.5,
        "no object": 10
    }

    data_dict = {}
    for index, image in enumerate(pdf_to_img(pdf_path)):
        pixel_values = detection_transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(pixel_values)
        objects = outputs_to_objects(outputs, image.size, id2label)

        # No word tokens are available here, so only the image crops are used.
        tables_crops = objects_to_crops(image, [], objects, detection_class_thresholds, padding=0)
        for table_index, table_crop in enumerate(tables_crops):
            cropped_table = table_crop['image'].convert("RGB")
            pixel_values = structure_transform(cropped_table).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = structure_model(pixel_values)
            cells = outputs_to_objects(outputs, cropped_table.size, structure_id2label)

            # Guard against an empty result before looking at the first
            # element's score, which decides whether the table is kept.
            if cells and cells[0]['score'] > 0.95:
                cell_coordinates = get_cell_coordinates_by_row(cells)
                data = apply_ocr(cell_coordinates, cropped_table)
                data_dict[f"{index+1}_{table_index+1}"] = data
    return data_dict