Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	| import torch | |
| from ultralytics import YOLO | |
| from PIL import Image | |
| import io | |
| import base64 | |
| device = 'cuda' | |
| from PIL import Image, ImageDraw, ImageFont | |
| import numpy as np | |
| import networkx as nx | |
| # import cv2 | |
| font_path = "util/arial.ttf" | |
class MarkHelper:
    """Cache the mark font and the pixel size of numeric marks per image size.

    Fonts are loaded from the module-level ``font_path``. For every distinct
    (image_height, image_width) pair the helper picks one font and records the
    rendered size of 1-, 2- and 3-digit marks, so later lookups are plain
    dictionary reads.
    """

    def __init__(self):
        # "{height}_{width}" -> {digit_count: (mark_height, mark_width)}
        self.markSize_dict = {}
        # "{height}_{width}" -> font object chosen for that image size
        self.font_dict = {}
        self.min_font_size = 20  # 1 in v1
        self.max_font_size = 30
        self.max_font_proportion = 0.04  # 0.032 in v1

    @staticmethod
    def _size_key(image_height, image_width):
        """Build the cache key shared by ``font_dict`` and ``markSize_dict``."""
        return f"{image_height}_{image_width}"

    def __get_markSize(self, text, image_height, image_width, font):
        """Return ``(height, width)`` of ``text`` rendered with ``font``.

        The values are the bottom and right coordinates of the text bounding
        box anchored at (0, 0), so any top/left offset of the glyphs is
        included. ``image_height`` and ``image_width`` are accepted for
        signature compatibility only.
        """
        # The bounding box does not depend on the canvas size, so a 1x1
        # scratch image avoids allocating a full-size image per measurement.
        scratch = Image.new('RGB', (1, 1))
        draw = ImageDraw.Draw(scratch)
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return height, width

    def _setup_new_font(self, image_height, image_width):
        """Choose a font for this image size and cache its mark sizes.

        Starting at ``min_font_size``, the font grows one point at a time
        until the smaller dimension of the rendered string "555" is no longer
        below ``min(max_font_size, max_font_proportion * min(image_height,
        image_width))``.
        """
        key = self._size_key(image_height, image_width)
        target = min(self.max_font_size,
                     self.max_font_proportion * min(image_height, image_width))

        fontsize = self.min_font_size
        font = ImageFont.truetype(font_path, fontsize)
        # Iterate until the text size is just larger than the criteria.
        while min(self.__get_markSize("555", image_height, image_width, font)) < target:
            fontsize += 1
            font = ImageFont.truetype(font_path, fontsize)
        self.font_dict[key] = font

        # Record the mark size for 1-, 2- and 3-digit labels.
        self.markSize_dict[key] = {
            digits: self.__get_markSize('5' * digits, image_height, image_width, font)
            for digits in (1, 2, 3)
        }

    def get_font(self, image_height, image_width):
        """Return the cached font for this image size, creating it if needed."""
        key = self._size_key(image_height, image_width)
        if key not in self.font_dict:
            self._setup_new_font(image_height, image_width)
        return self.font_dict[key]

    def get_mark_size(self, text_str, image_height, image_width):
        """Return ``(height, width)`` in pixels of the mark for ``text_str``.

        The size is looked up by the number of characters in ``text_str``;
        any length other than 1, 2 or 3 falls back to the 3-digit size.
        """
        key = self._size_key(image_height, image_width)
        if key not in self.markSize_dict:
            self._setup_new_font(image_height, image_width)
        sizes = self.markSize_dict[key]
        largest_size = sizes.get(3, None)
        # Default to the largest size if the text is too long.
        text_h, text_w = sizes.get(len(text_str), largest_size)
        return text_h, text_w
def __calculate_iou(box1, box2, return_area=False):
    """Score how strongly two boxes overlap, relative to the smaller box.

    Despite the name, this is not the union-based IoU: the value returned is
    ``intersection_area / (min(area1, area2) + 0.0001)``, where the small
    constant guards against division by zero for empty boxes.

    :param box1: Tuple of (y, x, h, w) for the first bounding box
    :param box2: Tuple of (y, x, h, w) for the second bounding box
    :param return_area: When true, also return the raw intersection area
    :return: The overlap ratio, or ``(ratio, intersection_area)`` when
        ``return_area`` is true
    """
    top1, left1, height1, width1 = box1
    top2, left2, height2, width2 = box2

    # Extent of the overlap along each axis; negative means no overlap there.
    overlap_h = min(top1 + height1, top2 + height2) - max(top1, top2)
    overlap_w = min(left1 + width1, left2 + width2) - max(left1, left2)
    intersection_area = max(0, overlap_h) * max(0, overlap_w)

    smaller_area = min(height1 * width1, height2 * width2)
    ratio = intersection_area / (smaller_area + 0.0001)

    return (ratio, intersection_area) if return_area else ratio
def __calculate_nearest_corner_distance(box1, box2):
    """Return the smallest corner-to-corner distance between two boxes.

    Both boxes are (y, x, h, w). All 16 pairings of a corner of ``box1`` with
    a corner of ``box2`` are compared using Euclidean distance; edges are not
    considered, only the four corner points of each box.
    """
    def corner_points(box):
        top, left, height, width = box
        bottom, right = top + height, left + width
        return np.array([
            [top, left],
            [top, right],
            [bottom, left],
            [bottom, right],
        ])

    points1 = corner_points(box1)
    points2 = corner_points(box2)

    # Broadcast (4, 1, 2) against (1, 4, 2) to get every pairwise offset.
    offsets = points1[:, None, :] - points2[None, :, :]
    pairwise = np.linalg.norm(offsets, axis=2)
    return pairwise.min()
def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size):
    """Find the corner with the least overlap with other bboxes.

    Eight candidate positions around ``bbox`` are scored. A candidate's score
    is its worst (largest) intersection area with any other box, reduced
    slightly by the corner distance to that box so that, for equal overlap,
    positions farther from their neighbours win. The candidate with the
    lowest score is returned.

    Args:
        bbox: (y, x, h, w) The bounding box to place the text on.
        bboxes: [(y, x, h, w)] The list of bounding boxes to compare against.
            May be a list or a 2D array.
        drawn_boxes: [(y, x, h, w)] The list of bounding boxes that have already been drawn on.
        text_size: (height, width) The size of the text to be drawn.
        image_size: (height, width) The size of the image.

    Returns:
        (y, x) of the top-left corner for the text box. If every candidate
        falls outside the image, the first candidate (above the top-left
        corner of ``bbox``) is returned unchanged.
    """
    y, x, h, w = bbox
    h_text, w_text = text_size
    image_height, image_width = image_size
    corners = [
        # top-left
        (y - h_text, x),
        # top-right
        (y - h_text, x + w - w_text),
        # right-top
        (y, x + w),
        # right-bottom
        (y + h - h_text, x + w),
        # bottom-right
        (y + h, x + w - w_text),
        # bottom-left
        (y + h, x),
        # left-bottom
        (y + h - h_text, x - w_text),
        # left-top
        (y, x - w_text),
    ]

    # list() keeps this a concatenation even when ``bboxes`` is an ndarray
    # (ndarray + list would broadcast or fail). The set of obstacles does not
    # depend on the candidate, so it is built once; boxes equal to ``bbox``
    # itself are excluded.
    obstacles = [
        other for other in list(bboxes) + list(drawn_boxes)
        if not np.array_equal(bbox, other)
    ]

    # Score used when there is no obstacle at all; lower than any real score.
    no_obstacle_score = -(image_width + image_height)

    best_corner = corners[0]
    best_score = float('inf')
    for corner in corners:
        # If the corner is out of the image, skip.
        if corner[0] < 0 or corner[1] < 0 or corner[0] + h_text > image_height or corner[1] + w_text > image_width:
            continue
        corner_bbox = (corner[0], corner[1], h_text, w_text)
        # Find the worst case for this corner: the largest overlap with any
        # other box.
        worst_score = no_obstacle_score
        for other_bbox in obstacles:
            overlap_area = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1]
            score = overlap_area - 0.0001 * __calculate_nearest_corner_distance(corner_bbox, other_bbox)
            worst_score = max(worst_score, score)
        # Keep the corner whose worst case is the mildest; ties keep the
        # earlier candidate.
        if worst_score < best_score:
            best_score = worst_score
            best_corner = corner
    return best_corner
def plot_boxes_with_marks(
    image: Image.Image,
    bboxes,  # (y, x, h, w)
    mark_helper: MarkHelper,
    linewidth=2,
    alpha=0,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draw bounding boxes on an image, each tagged with its index.

    Every box outline is drawn first. Then, when ``add_mark`` is true, each
    box gets a red label with its index in white, placed at the candidate
    corner that overlaps other boxes and earlier labels the least.

    Args:
        image: The image to plot the bounding boxes on. It is modified in place.
        bboxes: A sequence of boxes, each (y_top_left, x_top_left, box_height, box_width). If normalized_to_pixel is True, the values are floats normalized by the image size and are converted to int pixels. If normalized_to_pixel is False, the values are used as pixel coordinates.
        mark_helper: Provides the font and the label size for this image size.
        linewidth: Outline width of the boxes.
        alpha: Currently unused; kept for interface compatibility.
        edgecolor: Outline color passed to the drawing call.
        fn_save: If not None, the image is saved to this path.
        normalized_to_pixel: Whether ``bboxes`` must be scaled to pixels.
        add_mark: Whether to draw the index labels.

    Returns:
        The same ``image`` object, after drawing.
    """
    draw = ImageDraw.Draw(image)
    image_width, image_height = image.size
    if normalized_to_pixel:
        bboxes = [(int(y * image_height), int(x * image_width), int(h * image_height), int(w * image_width)) for y, x, h, w in bboxes]
    else:
        # A plain list keeps downstream list concatenation well-defined even
        # if the caller passed an array.
        bboxes = list(bboxes)

    # Draw the box outlines.
    for y, x, h, w in bboxes:
        draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth)

    # Label placement only matters when labels are drawn, so skip it otherwise.
    if add_mark and bboxes:
        font = mark_helper.get_font(image_height, image_width)
        drawn_boxes = []  # label boxes placed so far, as (y, x, h, w)
        for idx, bbox in enumerate(bboxes):
            text = str(idx)
            text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width)
            corner_y, corner_x = _find_least_overlapping_corner(
                bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width))

            # Draw the filled index box and text.
            draw.rectangle([corner_x, corner_y, corner_x + text_w, corner_y + text_h],  # (x, y, x + w, y + h)
                           fill="red")
            draw.text((corner_x, corner_y), text, fill='white', font=font)

            # Remember this label so later labels avoid it.
            drawn_boxes.append(np.array((corner_y, corner_x, text_h, text_w)))

    if fn_save is not None:  # PIL image
        image.save(fn_save)
    return image
def plot_circles_with_marks(
    image: Image.Image,
    points,  # (x, y)
    mark_helper: MarkHelper,
    linewidth=2,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draw a small circle (radius 5) at each point of an image.

    Args:
        image: The image to draw on. It is modified in place.
        points: A sequence of (x, y) points. If normalized_to_pixel is True, the values are floats normalized by the image size and are converted to int pixels. If normalized_to_pixel is False, the values are used as pixel coordinates.
        mark_helper: Currently unused; kept for interface compatibility.
        linewidth: Outline width of the circles.
        edgecolor: Outline color passed to the drawing call.
        fn_save: If not None, the image is saved to this path.
        normalized_to_pixel: Whether ``points`` must be scaled to pixels.
        add_mark: Currently unused; kept for interface compatibility.

    Returns:
        The same ``image`` object, after drawing.
    """
    image_width, image_height = image.size
    if normalized_to_pixel:
        # Scale the points themselves; the previous code referenced an
        # undefined ``bboxes`` here and raised on the default path.
        points = [(int(x * image_width), int(y * image_height)) for x, y in points]

    draw = ImageDraw.Draw(image)
    for x, y in points:
        draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth)

    if fn_save is not None:  # PIL image
        image.save(fn_save)
    return image
# Module-level MarkHelper instance; its font and mark-size caches persist for
# as long as the module is loaded.
markhelper = MarkHelper()

# Tuning constants for bounding-box post-processing.
# NOTE(review): not referenced in the visible part of this file -- presumably
# an overlap ratio used for de-duplicating boxes; confirm against callers.
BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5
# Default vertical / horizontal slack used by merge_connected_bboxes when
# deciding whether two boxes are close enough to be grouped.
BBOX_GROUPING_VERTICAL_THRESHOLD = 20
BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20
# NOTE(review): not referenced in the visible part of this file; meaning to be
# confirmed against callers.
BBOX_AUG_TARGET = 2.0
def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold):
    """Tell whether two (y, x, h, w) boxes overlap or lie within the thresholds.

    Along each axis, the box that starts first must extend to within the
    given threshold of the other box's start (touching and overlapping boxes
    qualify). The boxes are considered near only if this holds on both axes.
    """
    def near_along_axis(start_a, extent_a, start_b, extent_b, slack):
        # The earlier-starting box, stretched by ``slack``, must reach the
        # start of the later one.
        return (
            (start_a <= start_b and start_b - start_a <= extent_a + slack)
            or (start_b <= start_a and start_a - start_b <= extent_b + slack)
        )

    top1, left1, height1, width1 = bbox1
    top2, left2, height2, width2 = bbox2

    horizontally_close = near_along_axis(left1, width1, left2, width2, horizontal_threshold)
    vertically_close = near_along_axis(top1, height1, top2, height2, vertical_threshold)

    return vertically_close and horizontally_close
def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold):
    """Return a symmetric 0/1 matrix linking boxes that should be merged.

    Entry ``[i, j]`` is 1 when ``_is_boxes_same_line_or_near`` accepts boxes
    ``i`` and ``j`` under the given thresholds; the diagonal stays 0.
    """
    count = len(bboxes)
    adjacency = np.zeros((count, count), dtype=int)
    for first, box_a in enumerate(bboxes):
        # Only pairs with second > first are tested; the mirror entry is
        # filled in at the same time.
        for second in range(first + 1, count):
            if not _is_boxes_same_line_or_near(box_a, bboxes[second], vertical_threshold, horizontal_threshold):
                continue
            adjacency[first, second] = adjacency[second, first] = 1
    return adjacency
def merge_connected_bboxes(bboxes, text_details,
                           vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD,
                           horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD
                           ):
    """Merge neighbouring bboxes into their enclosing boxes and join their texts.

    Boxes are nodes of a graph; two boxes are linked when
    ``_build_adjacency_matrix`` marks them as close. Each connected component
    becomes one merged box (the tightest box enclosing its members) and one
    text entry (the members' texts joined with a space, in index order).

    Args:
        bboxes: A sequence or 2D array of shape (num_boxes, 4), where each row represents a bounding box as corners: (x1, y1, x2, y2).
        text_details: A list of text strings, one for each bounding box.
        vertical_threshold: The maximum vertical distance between two boxes to be considered in the same line.
        horizontal_threshold: The maximum horizontal distance between two boxes to be considered close.

    Returns:
        ``(merged_bboxes, merged_text_details)``. ``merged_bboxes`` is a list
        of (x1, y1, x2, y2) tuples. With zero or one input box, the inputs are
        returned unchanged.
    """
    # Nothing to merge with fewer than two boxes.
    if len(bboxes) <= 1:
        return bboxes, text_details

    corners = np.array(bboxes)
    x1, y1, x2, y2 = corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]

    # The proximity test works on (y, x, height, width) boxes.
    boxes_yxhw = np.array([y1, x1, y2 - y1, x2 - x1]).T
    A = _build_adjacency_matrix(boxes_yxhw, vertical_threshold, horizontal_threshold)

    # Each connected component of the proximity graph is one merged box.
    G = nx.from_numpy_array(A)

    merged_bboxes = []
    merged_text_details = []
    for component in nx.connected_components(G):
        indices = sorted(component)  # e.g., [30, 31, 32, 33, 34]
        merged_text_details.append(' '.join(text_details[i] for i in indices))
        # Take the enclosing corners straight from the input coordinates so
        # no rounding is introduced by converting to width/height and back.
        merged_bboxes.append((
            min(x1[i] for i in indices),
            min(y1[i] for i in indices),
            max(x2[i] for i in indices),
            max(y2[i] for i in indices),
        ))
    return merged_bboxes, merged_text_details
 
			
