Table-Detection-and-Recognition-using-DETR

Runtime error

App Files Files Community

Table-Detection-and-Recognition-using-DETR / codes /data_extraction.py

Abijith

Upload 5 files

38744b1 about 2 years ago

raw

history blame contribute delete

2.63 kB

	import os
	import re
	import pytesseract
	from pytesseract import Output
	from datatypes.datatypes import Row, Cell
	from codes.image_processing import ImageProcessor
	from datatypes.config import Config

	class TextDataExtraction():
	def __init__(self):
	pass

	def clean_ocr_data(self, value):
	transf = ''.join(e for e in value if e==' 'or e=='.' or e.isalnum())
	transf.strip()
	return transf

	def pytess(self, cell_pil_img):
	return ' '.join(pytesseract.image_to_data(cell_pil_img, output_type=Output.DICT, config='-c tessedit_char_blacklist=œ˜â€œï¬â™Ã©œ¢!\|”?«“¥ --psm 6 preserve_interword_spaces')['text']).strip()

	def cell_data_extraction(self, image, table_data):
	for table in table_data.tables:
	tableimg_processor = ImageProcessor()
	table_bbox = table.detection_box
	table_image = image.crop(table_bbox)
	table_image = tableimg_processor.image_padding(table_image, padd=Config['table_padd'])

	for row_idx, table_row in enumerate(table.ordered_recognitiondata[0].recognized_row):
	row_obj = Row([])
	xmin_row, ymin_row, xmax_row, ymax_row, _, _ = table_row

	row_image = table_image.crop((xmin_row,ymin_row,xmax_row,ymax_row))
	row_width, row_height = row_image.size
	row_obj.rowindex = row_idx

	# Cell bounding box creation
	xa, ya, xb, yb = 0, 0, 0, row_height

	for indx, table_column in enumerate(table.ordered_recognitiondata[0].recognized_column):
	cell_obj = Cell()
	xmin_col, _, xmax_col, _,_,_ = table_column
	xmin_col, xmax_col = xmin_col -Config['table_padd'], xmax_col - Config['table_padd']
	xa = xmin_col
	xb = xmax_col
	if indx == 0:
	xa = 0
	if indx == len(table.ordered_recognitiondata[0].recognized_column)-1:
	xb = row_width

	cell_img = row_image.crop((xa, ya, xb, yb))
	xa, ya, xb, yb = xa, ya, xb, yb

	cell_value = self.pytess(cell_img)
	transformed_cell_value = self.clean_ocr_data(cell_value)

	cell_obj.cellindex = indx
	cell_obj.value = transformed_cell_value

	row_obj.extracted_cells.append(cell_obj)
	table.extracted_rows.append(row_obj)

	return table_data