Spaces:
Build error
Build error
| from unimernet.common.registry import registry | |
| from omegaconf import OmegaConf | |
| import albumentations as alb | |
| from albumentations.pytorch import ToTensorV2 | |
| from unimernet.processors.base_processor import BaseProcessor | |
| import numpy as np | |
| import cv2 | |
| from PIL import Image, ImageOps | |
| from torchvision.transforms.functional import resize | |
| import random | |
| from unimernet.processors.formula_processor_helper.nougat import Bitmap, Dilation, Erosion | |
| from unimernet.processors.formula_processor_helper.weather import Fog, Frost, Snow, Rain, Shadow | |
class FormulaImageBaseProcessor(BaseProcessor):
    """Common geometry preprocessing for formula images.

    Subclasses add their own pixel-level transform; this base class only
    crops the margin, scales the image to fit the target canvas and pads it
    to exactly that canvas size.

    Attributes:
        input_size: ``[height, width]`` of the output canvas, as ints.
    """

    def __init__(self, image_size):
        """Store the target canvas size.

        Args:
            image_size: Either a two-element sequence ``(height, width)`` or
                a single int, which is treated as a square canvas.
        """
        super(FormulaImageBaseProcessor, self).__init__()
        # A bare int is not iterable; expand it so subclasses that default to
        # a scalar size (e.g. ``image_size=384``) do not crash below.
        if isinstance(image_size, int):
            image_size = (image_size, image_size)
        self.input_size = [int(_) for _ in image_size]
        assert len(self.input_size) == 2, "image_size must be (height, width)"

    @staticmethod
    def crop_margin(img: Image.Image) -> Image.Image:
        """Crop ``img`` to the bounding box of its darker (content) pixels.

        The image is converted to grayscale and contrast-stretched to the
        0..255 range; pixels below 200 after stretching count as content.

        Args:
            img: Source image.

        Returns:
            The cropped image, or ``img`` unchanged when every pixel has the
            same gray value (nothing to separate from the background).
        """
        data = np.array(img.convert("L"))
        data = data.astype(np.uint8)
        max_val = data.max()
        min_val = data.min()
        if max_val == min_val:
            return img
        # Stretch contrast so the fixed threshold works regardless of the
        # image's original brightness range.
        data = (data - min_val) / (max_val - min_val) * 255
        gray = 255 * (data < 200).astype(np.uint8)

        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
        return img.crop((a, b, w + a, h + b))

    def prepare_input(self, img: Image.Image, random_padding: bool = False):
        """Fit a PIL image onto a canvas of ``self.input_size``.

        Steps:
            - crop the margin around the content
            - resize so the image fits inside the canvas
            - pad to the exact canvas size

        Args:
            img: Source image, or ``None``.
            random_padding: When True the image is placed at a random offset
                inside the canvas; otherwise it is centered.

        Returns:
            The padded PIL image, or ``None`` when ``img`` is ``None``, cannot
            be read (``OSError``), or is empty after cropping.
        """
        if img is None:
            return None
        # crop margins
        try:
            img = self.crop_margin(img.convert("RGB"))
        except OSError:
            # might throw an error for broken files
            return None
        if img.height == 0 or img.width == 0:
            return None

        # Scale by the smaller canvas side first, then shrink in place if the
        # result still exceeds the canvas (thumbnail takes (width, height)).
        img = resize(img, min(self.input_size))
        img.thumbnail((self.input_size[1], self.input_size[0]))

        delta_width = self.input_size[1] - img.width
        delta_height = self.input_size[0] - img.height
        if random_padding:
            pad_width = np.random.randint(low=0, high=delta_width + 1)
            pad_height = np.random.randint(low=0, high=delta_height + 1)
        else:
            pad_width = delta_width // 2
            pad_height = delta_height // 2
        # (left, top, right, bottom); the right/bottom share absorbs the
        # remainder so the result is exactly the canvas size.
        padding = (
            pad_width,
            pad_height,
            delta_width - pad_width,
            delta_height - pad_height,
        )
        return ImageOps.expand(img, padding)
class FormulaImageTrainProcessor(FormulaImageBaseProcessor):
    """Training-time processor: geometry preprocessing plus augmentation."""

    def __init__(self, image_size=384):
        """Build the augmentation pipeline.

        Args:
            image_size: ``(height, width)`` of the output canvas, or a single
                int for a square canvas.
        """
        # The default is a bare int; expand it here so the constructor works
        # without arguments.
        if isinstance(image_size, int):
            image_size = (image_size, image_size)
        super().__init__(image_size)

        self.transform = alb.Compose(
            [
                # Heavier degradations, applied together as a group with
                # probability 0.15.
                alb.Compose(
                    [
                        Bitmap(p=0.05),
                        alb.OneOf([Fog(), Frost(), Snow(), Rain(), Shadow()], p=0.2),
                        alb.OneOf([Erosion((2, 3)), Dilation((2, 3))], p=0.2),
                        alb.ShiftScaleRotate(
                            shift_limit=0,
                            scale_limit=(-.15, 0),
                            rotate_limit=1,
                            border_mode=0,
                            interpolation=3,
                            value=[255, 255, 255],
                            p=1,
                        ),
                        alb.GridDistortion(
                            distort_limit=0.1,
                            border_mode=0,
                            interpolation=3,
                            value=[255, 255, 255],
                            p=.5,
                        ),
                    ],
                    p=.15,
                ),
                # alb.InvertImg(p=.15),
                alb.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),
                alb.GaussNoise(10, p=.2),
                alb.RandomBrightnessContrast(.05, (-.2, 0), True, p=0.2),
                alb.ImageCompression(95, p=.3),
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                # alb.Sharpen()
                ToTensorV2(),
            ]
        )

    def __call__(self, item):
        """Preprocess and augment one image.

        Args:
            item: PIL image, or ``None``.

        Returns:
            The transformed image restricted to its first slice along the
            leading axis (presumably the channel axis after ``ToTensorV2``),
            or ``None`` when ``prepare_input`` rejects the image.
        """
        img = self.prepare_input(item, random_padding=True)
        if img is None:
            return img
        return self.transform(image=np.array(img))['image'][:1]

    @classmethod
    def from_config(cls, cfg=None):
        """Alternate constructor reading ``image_size`` from a config.

        Args:
            cfg: Config object supporting ``get``; an empty OmegaConf config
                is used when omitted.

        Returns:
            A new processor; ``image_size`` defaults to ``[384, 384]``.
        """
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", [384, 384])

        return cls(
            image_size=image_size,
        )
class FormulaImageMultiScaleTrainProcessor(FormulaImageTrainProcessor):
    """Training processor whose canvas size can be switched between scales.

    Attributes:
        all_scales: List of ``[height, width]`` int pairs to choose from.
    """

    def __init__(self, all_scales):
        """Initialise with the first scale active.

        Args:
            all_scales: Non-empty sequence of two-element size sequences.
        """
        # Build a fresh list instead of assigning into ``all_scales`` so the
        # caller's container (possibly a config object) is left untouched.
        scales = [[int(_) for _ in scale] for scale in all_scales]
        super(FormulaImageMultiScaleTrainProcessor, self).__init__(scales[0])
        self.all_scales = scales

    @classmethod
    def from_config(cls, cfg=None):
        """Alternate constructor reading ``all_scales`` from a config.

        Args:
            cfg: Config object supporting ``get``; an empty OmegaConf config
                is used when omitted.

        Returns:
            A new processor; ``all_scales`` defaults to ``[[384, 384]]``.
        """
        if cfg is None:
            cfg = OmegaConf.create()

        all_scales = cfg.get("all_scales", [[384, 384]])

        return cls(
            all_scales=all_scales
        )

    def reset_scale(self):
        """Pick a random entry of ``all_scales`` as the new ``input_size``."""
        self.input_size = random.choice(self.all_scales)
class FormulaImageEvalProcessor(FormulaImageBaseProcessor):
    """Evaluation-time processor: geometry preprocessing, no augmentation."""

    def __init__(self, image_size):
        """Build the deterministic evaluation transform.

        Args:
            image_size: Target canvas size, forwarded to the base class.
        """
        super().__init__(image_size)

        self.transform = alb.Compose(
            [
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                # alb.Sharpen()
                ToTensorV2(),
            ]
        )

    def __call__(self, item):
        """Preprocess one image for evaluation.

        Args:
            item: PIL image, or ``None``.

        Returns:
            The transformed image restricted to its first slice along the
            leading axis (presumably the channel axis after ``ToTensorV2``),
            or ``None`` when ``prepare_input`` rejects the image.
        """
        image = self.prepare_input(item)
        # prepare_input returns None for missing, unreadable or empty images;
        # propagate that instead of feeding np.array(None) to the transform.
        if image is None:
            return None
        return self.transform(image=np.array(image))['image'][:1]

    @classmethod
    def from_config(cls, cfg=None):
        """Alternate constructor reading ``image_size`` from a config.

        Args:
            cfg: Config object supporting ``get``; an empty OmegaConf config
                is used when omitted.

        Returns:
            A new processor; ``image_size`` defaults to ``[384, 384]``.
        """
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", [384, 384])

        return cls(image_size=image_size)