Spaces:
Runtime error
Runtime error
| from typing import List, Union | |
| from vision_functions import find_in_image, simple_qa, verify_property, best_text_match, compute_depth | |
def bool_to_yesno(bool_answer: bool) -> str:
    """Translate a truthy/falsy answer into the string "yes" or "no"."""
    if bool_answer:
        return "yes"
    return "no"
class ImagePatch:
    """A Python class containing a crop of an image centered around a particular object, as well as relevant information.

    Attributes
    ----------
    cropped_image : array_like
        An array-like of the cropped image taken from the original image.
    left : int
        An int describing the position of the left border of the crop's bounding box in the original image.
    lower : int
        An int describing the position of the bottom border of the crop's bounding box in the original image.
    right : int
        An int describing the position of the right border of the crop's bounding box in the original image.
    upper : int
        An int describing the position of the top border of the crop's bounding box in the original image.
    width : int
        Size of axis 2 of cropped_image.
    height : int
        Size of axis 1 of cropped_image.
    horizontal_center : float
        Midpoint between left and right.
    vertical_center : float
        Midpoint between lower and upper.

    Methods
    -------
    find(object_name: str) -> List[ImagePatch]
        Returns a list of new ImagePatch objects containing crops of the image centered around any objects found in the
        image matching the object_name.
    simple_query(question: str=None) -> str
        Returns the answer to a basic question asked about the image. If no question is provided, returns the answer
        to "What is this?".
    exists(object_name: str) -> bool
        Returns True if the object specified by object_name is found in the image, and False otherwise.
    verify_property(object_name: str, property: str) -> bool
        Returns True if the object possesses the property, and False otherwise.
    compute_depth() -> float
        Returns the median depth of the image crop.
    best_text_match(option_list: List[str]) -> str
        Returns the string that best matches the image.
    crop(left: int, lower: int, right: int, upper: int) -> ImagePatch
        Returns a new ImagePatch object containing a crop of the image at the given coordinates.
    """

    def __init__(self, image, left: int = None, lower: int = None, right: int = None, upper: int = None):
        """Initializes an ImagePatch object by cropping the image at the given coordinates and stores the coordinates as
        attributes. If no coordinates are provided, the image is left unmodified, and the coordinates are set to the
        dimensions of the image. Any single coordinate left as None falls back to the matching image border.

        Parameters
        -------
        image : array_like
            An array-like of the original image. Axis 1 is used as height and axis 2 as width
            (axis 0 is presumably the channel axis -- TODO confirm).
        left : int
            An int describing the position of the left border of the crop's bounding box in the original image.
        lower : int
            An int describing the position of the bottom border of the crop's bounding box in the original image.
        right : int
            An int describing the position of the right border of the crop's bounding box in the original image.
        upper : int
            An int describing the position of the top border of the crop's bounding box in the original image.
        """
        # NOTE(review): rows are sliced as lower:upper without flipping the vertical axis; whether that
        # matches a bottom-left origin depends on how the detector reports boxes -- confirm.
        if left is None and lower is None and right is None and upper is None:
            # No box given: keep the very same image object and span it completely.
            self.cropped_image = image
            left, lower, right, upper = 0, 0, image.shape[2], image.shape[1]
        else:
            # Fill in each missing border individually so the stored box is always numeric;
            # otherwise the center computations below would fail on None.
            left = 0 if left is None else left
            lower = 0 if lower is None else lower
            right = image.shape[2] if right is None else right
            upper = image.shape[1] if upper is None else upper
            self.cropped_image = image[:, lower:upper, left:right]

        self.left = left
        self.lower = lower
        self.right = right
        self.upper = upper

        self.width = self.cropped_image.shape[2]
        self.height = self.cropped_image.shape[1]

        self.horizontal_center = (self.left + self.right) / 2
        self.vertical_center = (self.lower + self.upper) / 2

    def find(self, object_name: str) -> List["ImagePatch"]:
        """Returns a list of ImagePatch objects, one per object in the crop matching object_name
        (the result of find_in_image is returned unchanged).

        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        --------
        >>> # Given an image: Find the foo.
        >>> def execute_command(image) -> List[ImagePatch]:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     return foo_patches
        """
        return find_in_image(self.cropped_image, object_name)

    def simple_query(self, question: str = None) -> str:
        """Returns the answer to a basic question asked about the image. If no question is provided, returns the answer
        to "What is this?".

        Parameters
        -------
        question : str
            A string describing the question to be asked.

        Examples
        -------
        >>> # Given an image: Which kind of animal is not eating?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     animal_patches = image_patch.find("animal")
        >>>     for animal_patch in animal_patches:
        >>>         if not animal_patch.verify_property("animal", "eating"):
        >>>             return animal_patch.simple_query("What kind of animal is eating?")  # crop would include eating so keep it in the query
        >>>     # If no animal is not eating, query the image directly
        >>>     return image_patch.simple_query("Which kind of animal is not eating?")

        >>> # Given an image: What is in front of the horse?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     # contains a relation (around, next to, on, near, on top of, in front of, behind, etc), so ask directly
        >>>     return image_patch.simple_query("What is in front of the horse?")
        """
        if question is None:
            # Apply the documented default rather than forwarding None to the QA backend.
            question = "What is this?"
        return simple_qa(self.cropped_image, question)

    def exists(self, object_name: str) -> bool:
        """Returns True if the object specified by object_name is found in the image, and False otherwise.

        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        -------
        >>> # Given an image: Are there both cakes and gummy bears in the photo?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     is_cake = image_patch.exists("cake")
        >>>     is_gummy_bear = image_patch.exists("gummy bear")
        >>>     return bool_to_yesno(is_cake and is_gummy_bear)
        """
        return len(self.find(object_name)) > 0

    def verify_property(self, object_name: str, property: str) -> bool:
        """Returns True if the object possesses the property, and False otherwise.
        Differs from 'exists' in that it presupposes the existence of the object specified by object_name, instead
        checking whether the object possesses the property.

        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.
        property : str
            A string describing the property to be checked.

        Examples
        -------
        >>> # Given an image: Do the letters have blue color?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     letters_patches = image_patch.find("letters")
        >>>     # Question assumes only one letter patch
        >>>     if len(letters_patches) == 0:
        >>>         # If no letters are found, query the image directly
        >>>         return image_patch.simple_query("Do the letters have blue color?")
        >>>     return bool_to_yesno(letters_patches[0].verify_property("letters", "blue"))
        """
        return verify_property(self.cropped_image, object_name, property)

    def compute_depth(self) -> float:
        """Returns the median depth of the image crop

        Returns
        -------
        float
            the median depth of the image crop

        Examples
        --------
        >>> # Given an image: Find the bar furthest away.
        >>> def execute_command(image)->ImagePatch:
        >>>     image_patch = ImagePatch(image)
        >>>     bar_patches = image_patch.find("bar")
        >>>     bar_patches.sort(key=lambda bar: bar.compute_depth())
        >>>     return bar_patches[-1]
        """
        depth_map = compute_depth(self.cropped_image)
        return depth_map.median()

    def best_text_match(self, option_list: List[str]) -> str:
        """Returns the string that best matches the image.

        Parameters
        -------
        option_list : List[str]
            A list with the names of the different options

        Examples
        -------
        >>> # Given an image: Is the cap gold or white?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     cap_patches = image_patch.find("cap")
        >>>     # Question assumes one cap patch
        >>>     if len(cap_patches) == 0:
        >>>         # If no cap is found, query the image directly
        >>>         return image_patch.simple_query("Is the cap gold or white?")
        >>>     return cap_patches[0].best_text_match(["gold", "white"])
        """
        return best_text_match(self.cropped_image, option_list)

    def crop(self, left: int, lower: int, right: int, upper: int) -> "ImagePatch":
        """Returns a new ImagePatch cropped from the current ImagePatch.
        The coordinates index into this patch's cropped_image, not into the original image.

        Parameters
        -------
        left : int
            The leftmost pixel of the cropped image.
        lower : int
            The lowest pixel of the cropped image.
        right : int
            The rightmost pixel of the cropped image.
        upper : int
            The uppermost pixel of the cropped image.
        """
        return ImagePatch(self.cropped_image, left, lower, right, upper)
def best_image_match(list_patches: List[ImagePatch], content: List[str], return_index=False) -> Union[ImagePatch, int]:
    """Returns the patch most likely to contain the content.

    Parameters
    ----------
    list_patches : List[ImagePatch]
        Candidate patches to choose from.
    content : List[str]
        the object of interest
    return_index : bool
        if True, returns the index of the patch most likely to contain the object

    Returns
    -------
    ImagePatch or int
        Patch most likely to contain the object, or (per return_index) its index.

    NOTE(review): the body below only calls best_image_match again with the same arguments, so as written
    a direct call never terminates normally. Presumably this is a placeholder that is replaced by a real
    backend at execution time -- confirm before calling it from this module.
    """
    return best_image_match(list_patches, content, return_index)
def distance(patch_a: "ImagePatch", patch_b: "ImagePatch") -> float:
    """
    Returns the distance between the edges of two ImagePatches. If the patches overlap, it returns a negative distance
    corresponding to the negative intersection over union.

    Only the left / lower / right / upper attributes of the two patches are read.

    Parameters
    ----------
    patch_a : ImagePatch
    patch_b : ImagePatch

    Returns
    -------
    float
        Positive Euclidean gap between the two boxes when they are apart, minus their
        intersection-over-union when they overlap, and 0.0 when they merely touch.

    Examples
    --------
    # Return the qux that is closest to the foo
    >>> def execute_command(image):
    >>>     image_patch = ImagePatch(image)
    >>>     qux_patches = image_patch.find('qux')
    >>>     foo_patches = image_patch.find('foo')
    >>>     foo_patch = foo_patches[0]
    >>>     qux_patches.sort(key=lambda x: distance(x, foo_patch))
    >>>     return qux_patches[0]
    """
    import math

    # Per-axis gap between the boxes: zero when their projections on that axis overlap.
    gap_x = max(0, patch_a.left - patch_b.right, patch_b.left - patch_a.right)
    gap_y = max(0, patch_a.lower - patch_b.upper, patch_b.lower - patch_a.upper)
    edge_distance = math.hypot(gap_x, gap_y)
    if edge_distance > 0:
        return edge_distance

    # The boxes touch or overlap: report minus the intersection over union.
    overlap_w = min(patch_a.right, patch_b.right) - max(patch_a.left, patch_b.left)
    overlap_h = min(patch_a.upper, patch_b.upper) - max(patch_a.lower, patch_b.lower)
    intersection = max(0, overlap_w) * max(0, overlap_h)
    area_a = (patch_a.right - patch_a.left) * (patch_a.upper - patch_a.lower)
    area_b = (patch_b.right - patch_b.left) * (patch_b.upper - patch_b.lower)
    union = area_a + area_b - intersection
    if union <= 0:
        # Degenerate (zero-area) boxes: IoU is undefined, treat them as touching.
        return 0.0
    return -intersection / union
| # Examples of using ImagePatch | |
| # Given an image: What toy is wearing a shirt? | |
def execute_command(image) -> str:
    """Answer "What toy is wearing a shirt?" by inspecting each detected toy in turn."""
    # not a relational verb so go step by step
    question = "What toy is wearing a shirt?"
    whole_image = ImagePatch(image)
    toys = whole_image.find("toy")
    if not toys:
        # No toy detected: let the QA model look at the full image instead
        return whole_image.simple_query(question)
    for toy in toys:
        if toy.simple_query("Is the toy wearing a shirt?") == "yes":
            # the crop still contains the shirt, so the question can stay unchanged
            return toy.simple_query(question)
    # None of the toys passed the check; answer from the first detection
    return toys[0].simple_query(question)
| # Given an image: Who is the man staring at? | |
def execute_command(image) -> str:
    """Answer a relational question ("staring at") by querying the whole image directly."""
    # the question asks for the predicate of a relational verb, so no cropping is done
    return ImagePatch(image).simple_query("Who is the man staring at?")
| # Given an image: Find more visible chair. | |
def execute_command(image) -> ImagePatch:
    """Return the first detected chair, or the whole image when no chair is detected."""
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # Indexing an empty detection list would raise IndexError; use the full image instead
        chair_patches = [image_patch]
    # Remember: return the chair
    return chair_patches[0]
| # Given an image: Find lamp on the bottom. | |
def execute_command(image) -> ImagePatch:
    """Return the lamp with the smallest vertical_center, or the whole image when no lamp is detected."""
    # Return the lamp
    image_patch = ImagePatch(image)
    lamp_patches = image_patch.find("lamp")
    if len(lamp_patches) == 0:
        # Nothing to sort: fall back to the full image rather than raising IndexError
        lamp_patches = [image_patch]
    lamp_patches.sort(key=lambda lamp: lamp.vertical_center)
    # Remember: return the lamp
    return lamp_patches[0]  # Return the bottommost lamp
| # Given a list of images: Does the pole that is near a building that is near a green sign and the pole that is near bushes that are near a green sign have the same material? | |
def execute_command(image_list) -> str:
    """Compare the material of the pole near a building (with a green sign) and the pole near bushes
    (with a green sign); answers "no" when either pole could not be located."""
    material_1 = None  # pole found inside a building patch that also holds a green sign
    material_2 = None  # pole found inside a bushes patch that also holds a green sign
    for image in image_list:
        image = ImagePatch(image)
        # find the building
        for building_patch in image.find("building"):
            poles = building_patch.find("pole")
            signs = building_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_1 = poles[0].simple_query("What is the material of the pole?")
        # find the bush
        for bushes_patch in image.find("bushes"):
            poles = bushes_patch.find("pole")
            signs = bushes_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_2 = poles[0].simple_query("What is the material of the pole?")
    if material_1 is None or material_2 is None:
        # A pole that was never found has no material; None == None must not count as "same"
        return "no"
    return bool_to_yesno(material_1 == material_2)
| # Given an image: Find middle kid. | |
def execute_command(image) -> ImagePatch:
    """Return the kid in the middle of the left-to-right ordering (whole image if no kid is detected)."""
    whole_image = ImagePatch(image)
    kids = whole_image.find("kid")
    if not kids:
        kids = [whole_image]
    # Order by horizontal position, then take the median element
    ordered_kids = sorted(kids, key=lambda candidate: candidate.horizontal_center)
    middle_index = len(ordered_kids) // 2
    return ordered_kids[middle_index]
| # Given an image: Is that blanket to the right of a pillow? | |
def execute_command(image) -> str:
    """Answer "yes" when some detected blanket lies to the right of some detected pillow."""
    image_patch = ImagePatch(image)
    blanket_patches = image_patch.find("blanket")
    # Question assumes only one blanket patch
    if len(blanket_patches) == 0:
        # If no blanket is found, query the image directly
        return image_patch.simple_query("Is that blanket to the right of a pillow?")
    # The pillow detections do not depend on the blanket, so look them up once
    pillow_patches = image_patch.find("pillow")
    for blanket_patch in blanket_patches:
        for pillow_patch in pillow_patches:
            # "blanket to the right of a pillow" means the blanket has the larger x-center
            if blanket_patch.horizontal_center > pillow_patch.horizontal_center:
                return "yes"
    return "no"
| # Given an image: How many people are there? | |
def execute_command(image) -> str:
    """Count the detected persons and return the count as a string."""
    people = ImagePatch(image).find("person")
    return f"{len(people)}"
# Given a list of images: Is the man that is wearing dark pants driving?
def execute_command(image_list) -> str:
    """Find the first man whose first detected pants are dark and ask whether he is driving."""
    for raw_image in image_list:
        scene = ImagePatch(raw_image)
        for man in scene.find("man"):
            pants_found = man.find("pants")
            # Men without detected pants are skipped; only the first pants patch is checked
            if pants_found and pants_found[0].verify_property("pants", "dark"):
                return man.simple_query("Is this man driving?")
    # No matching man in any image: ask the full question on the first image
    return ImagePatch(image_list[0]).simple_query("Is the man that is wearing dark pants driving?")
| # Given an image: Is there a backpack to the right of the man? | |
def execute_command(image) -> str:
    """Answer whether any detected backpack is centered to the right of the first detected man."""
    whole_image = ImagePatch(image)
    men = whole_image.find("man")
    if not men:
        # Without a man there is no reference point; ask the image directly
        return whole_image.simple_query("Is there a backpack to the right of the man?")
    reference_man = men[0]
    backpacks = whole_image.find("backpack")
    # An empty backpack list simply yields "no"
    on_the_right = any(
        backpack.horizontal_center > reference_man.horizontal_center for backpack in backpacks
    )
    return "yes" if on_the_right else "no"
| # Given a list of images: What is the pizza with red tomato on it on? | |
def execute_command(image_list) -> str:
    """Find the first pizza carrying a red tomato and ask what that pizza is on."""
    for raw_image in image_list:
        scene = ImagePatch(raw_image)
        for pizza in scene.find("pizza"):
            # Every tomato on the pizza is checked (no early exit) before deciding
            redness = [tomato.verify_property("tomato", "red") for tomato in pizza.find("tomato")]
            if any(redness):
                return pizza.simple_query("What is the pizza on?")
    # No pizza qualified: ask the full question on the first image
    return ImagePatch(image_list[0]).simple_query("What is the pizza with red tomato on it on?")
| # Given an image: Find chair to the right near the couch. | |
def execute_command(image) -> ImagePatch:
    """Return the chair in the right half of the image that is closest to the (first) couch."""
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        chair_patches = [image_patch]
    elif len(chair_patches) == 1:
        return chair_patches[0]
    chair_patches_right = [c for c in chair_patches if c.horizontal_center > image_patch.horizontal_center]
    if len(chair_patches_right) == 0:
        # No chair right of the image center (always the case for the whole-image fallback):
        # consider every candidate instead of indexing an empty list
        chair_patches_right = chair_patches
    couch_patches = image_patch.find("couch")
    if len(couch_patches) == 0:
        couch_patches = [image_patch]
    couch_patch = couch_patches[0]
    chair_patches_right.sort(key=lambda c: distance(c, couch_patch))
    chair_patch = chair_patches_right[0]
    # Remember: return the chair
    return chair_patch
| # Given an image: Are there bagels or lemons? | |
def execute_command(image) -> str:
    """Answer "yes" when a bagel or a lemon is detected in the image."""
    whole_image = ImagePatch(image)
    # Both lookups are always performed, bagel first
    detections = [whole_image.exists(name) for name in ("bagel", "lemon")]
    return bool_to_yesno(any(detections))
| # Given an image: In which part is the bread, the bottom or the top? | |
def execute_command(image) -> str:
    """Report whether the first detected bread sits in the bottom or the top part of the image."""
    whole_image = ImagePatch(image)
    breads = whole_image.find("bread")
    if not breads:
        # No bread detected: ask the image directly
        return whole_image.simple_query("In which part is the bread, the bottom or the top?")
    # A vertical center below the image's own center counts as "bottom"
    below_middle = breads[0].vertical_center < whole_image.vertical_center
    return "bottom" if below_middle else "top"
| # Given an image: Find foo to bottom left. | |
def execute_command(image) -> ImagePatch:
    """Return the leftmost foo among those within 100 px (vertical center) of the lowest foo."""
    # Return the foo
    image_patch = ImagePatch(image)
    foo_patches = image_patch.find("foo")
    if len(foo_patches) == 0:
        # min() over no detections would raise ValueError; use the full image instead
        foo_patches = [image_patch]
    lowermost_coordinate = min(patch.vertical_center for patch in foo_patches)
    # The lowest patch itself always passes this filter, so the result is never empty
    foo_patches_bottom = [patch for patch in foo_patches if patch.vertical_center - lowermost_coordinate < 100]
    foo_patches_bottom.sort(key=lambda foo: foo.horizontal_center)
    foo_patch = foo_patches_bottom[0]
    # Remember: return the foo
    return foo_patch
| # Given an image: Find number 17. | |
def execute_command(image) -> ImagePatch:
    """Return the first person whose crop contains "17"; otherwise the first person, or the whole image."""
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: fall back to the full image so the final index is always valid
        person_patches = [image_patch]
    for patch in person_patches:
        if patch.exists("17"):
            return patch
    # Remember: return the person
    return person_patches[0]
| # Given a list of images: Is the statement true? There is at least 1 image with a brown dog that is near a bicycle and is wearing a collar. | |
def execute_command(image_list) -> str:
    """Answer "yes" when some image holds a dog that is brown, near a bicycle and wearing a collar."""
    for image in image_list:
        image = ImagePatch(image)
        dog_patches = image.find("dog")
        for dog in dog_patches:
            # The statement is about a *brown* dog, so the colour has to be verified as well
            is_brown = dog.verify_property("dog", "brown")
            near_bicycle = dog.simple_query("Is the dog near a bicycle?")
            wearing_collar = dog.simple_query("Is the dog wearing a collar?")
            if is_brown and near_bicycle == "yes" and wearing_collar == "yes":
                return 'yes'
    return 'no'
| # Given an image: Find dog to the left of the post who is closest to girl wearing a shirt with text that says "I love you". | |
def execute_command(image) -> ImagePatch:
    """Return the dog left of (and closest to) the post nearest the "I love you" shirt."""
    # Return the dog
    image_patch = ImagePatch(image)
    shirt_patches = image_patch.find("shirt")
    if len(shirt_patches) == 0:
        shirt_patches = [image_patch]
    shirt_patch = best_image_match(list_patches=shirt_patches, content=["I love you shirt"])
    post_patches = image_patch.find("post")
    if len(post_patches) == 0:
        # Without a detected post, use the full image as the reference box
        post_patches = [image_patch]
    post_patches.sort(key=lambda post: distance(post, shirt_patch))
    post_patch = post_patches[0]
    dog_patches = image_patch.find("dog")
    if len(dog_patches) == 0:
        # No dog detected: fall back to the full image rather than indexing an empty list
        dog_patches = [image_patch]
    dogs_left_patch = [dog for dog in dog_patches if dog.left < post_patch.left]
    if len(dogs_left_patch) == 0:
        dogs_left_patch = dog_patches
    dogs_left_patch.sort(key=lambda dog: distance(dog, post_patch))
    dog_patch = dogs_left_patch[0]
    # Remember: return the dog
    return dog_patch
| # Given an image: Find balloon on the right and second from the bottom. | |
def execute_command(image) -> ImagePatch:
    """Return the second-lowest balloon among those within 100 px (horizontal center) of the rightmost one."""
    # Return the balloon
    image_patch = ImagePatch(image)
    balloon_patches = image_patch.find("balloon")
    if len(balloon_patches) == 0:
        balloon_patches = [image_patch]
    elif len(balloon_patches) == 1:
        return balloon_patches[0]
    # "on the right": measure from the rightmost balloon, not the leftmost one
    rightmost_coordinate = max(patch.horizontal_center for patch in balloon_patches)
    balloon_patches_right = [
        patch for patch in balloon_patches if rightmost_coordinate - patch.horizontal_center < 100
    ]
    # The rightmost balloon always passes the filter, so the list is never empty
    balloon_patches_right.sort(key=lambda p: p.vertical_center)
    # Second from the bottom when there are at least two candidates, otherwise the only one
    balloon_patch = balloon_patches_right[min(1, len(balloon_patches_right) - 1)]
    # Remember: return the balloon
    return balloon_patch
| # Given an image: Find girl in white next to man in left. | |
def execute_command(image) -> ImagePatch:
    """Return the girl in white clothing that is closest to the leftmost man."""
    # Return the girl
    image_patch = ImagePatch(image)
    girl_patches = image_patch.find("girl")
    if len(girl_patches) == 0:
        # No girl detected: fall back to the full image so a patch can always be returned
        girl_patches = [image_patch]
    girl_in_white_patches = [g for g in girl_patches if g.verify_property("girl", "white clothing")]
    if len(girl_in_white_patches) == 0:
        girl_in_white_patches = girl_patches
    man_patches = image_patch.find("man")
    if len(man_patches) == 0:
        # No man detected: use the full image as the reference box
        man_patches = [image_patch]
    man_patches.sort(key=lambda man: man.horizontal_center)
    leftmost_man = man_patches[0]  # First from the left
    girl_in_white_patches.sort(key=lambda girl: distance(girl, leftmost_man))
    girl_patch = girl_in_white_patches[0]
    # Remember: return the girl
    return girl_patch
| # Given a list of images: Is the statement true? There is 1 table that is in front of woman that is wearing jacket. | |
def execute_command(image_list) -> str:
    """For the first woman found wearing a jacket, answer whether exactly one table is in her crop."""
    for raw_image in image_list:
        scene = ImagePatch(raw_image)
        for woman in scene.find("woman"):
            if woman.simple_query("Is the woman wearing jacket?") != "yes":
                continue
            # The answer is decided by the first jacket-wearing woman encountered
            table_count = len(woman.find("table"))
            return bool_to_yesno(table_count == 1)
    return 'no'
| # Given an image: Find top left. | |
def execute_command(image) -> ImagePatch:
    """Return the leftmost person among those within 100 px (vertical center) of the topmost person."""
    # Return the person
    image_patch = ImagePatch(image)
    # Figure out what thing the caption is referring to. We need a subject for every caption
    persons = image_patch.find("person")
    if len(persons) == 0:
        # max() over no detections would raise ValueError; use the full image instead
        persons = [image_patch]
    top_all_objects = max(obj.vertical_center for obj in persons)
    # Select objects that are close to the top
    # We do this because the caption is asking first about vertical and then about horizontal
    # (the topmost person always passes this filter, so the list is never empty)
    persons_top = [p for p in persons if top_all_objects - p.vertical_center < 100]
    # And after that, obtain the leftmost object among them
    persons_top.sort(key=lambda obj: obj.horizontal_center)
    person_leftmost = persons_top[0]
    # Remember: return the person
    return person_leftmost
| # Given an image: What type of weather do you see in the photograph? | |
def execute_command(image) -> str:
    """Ask the weather question about the whole image."""
    whole_image = ImagePatch(image)
    answer = whole_image.simple_query("What type of weather do you see in the photograph?")
    return answer
| # Given an image: How many orange life vests can be seen? | |
def execute_command(image) -> str:
    """Count the detected life vests that are verified as orange; return the count as a string."""
    vests = ImagePatch(image).find("life vest")
    # Keep only the vests whose colour check succeeds
    orange_vests = [vest for vest in vests if vest.verify_property('life vest', 'orange')]
    return str(len(orange_vests))
| # Given an image: What is behind the pole? | |
def execute_command(image) -> str:
    """Answer a relational question ("behind") by querying the whole image directly."""
    # relations (around, next to, on, near, on top of, in front of, behind, ...) are asked as-is
    return ImagePatch(image).simple_query("What is behind the pole?")
| # Given an image: Find second to top flower. | |
def execute_command(image) -> ImagePatch:
    """Return the flower with the second-largest vertical_center (the only one, or the whole image, if fewer than two)."""
    # Return the flower
    image_patch = ImagePatch(image)
    flower_patches = image_patch.find("flower")
    if len(flower_patches) == 0:
        flower_patches = [image_patch]
    if len(flower_patches) == 1:
        # There is no "second" flower; index -2 would raise IndexError
        return flower_patches[0]
    flower_patches.sort(key=lambda flower: flower.vertical_center)
    flower_patch = flower_patches[-2]
    # Remember: return the flower
    return flower_patch
| # Given an image: Find back. | |
def execute_command(image) -> ImagePatch:
    """Return the person with the largest median depth, or the whole image when nobody is detected."""
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # Nothing to rank: fall back to the full image rather than raising IndexError
        return image_patch
    person_patches.sort(key=lambda person: person.compute_depth())
    person_patch = person_patches[-1]
    # Remember: return the person
    return person_patch
| # Given an image: Find chair at the front. | |
def execute_command(image) -> ImagePatch:
    """Return the chair with the smallest median depth, or the whole image when no chair is detected."""
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # Nothing to rank: fall back to the full image rather than raising IndexError
        return image_patch
    chair_patches.sort(key=lambda chair: chair.compute_depth())
    chair_patch = chair_patches[0]
    # Remember: return the chair
    return chair_patch
| # Given an image: Find white and yellow pants. | |
def execute_command(image) -> ImagePatch:
    """Return the person patch that best matches "white pants" / "yellow pants"."""
    # Return the person
    image_patch = ImagePatch(image)
    # Clothing always requires returning the person
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # Never hand an empty candidate list to best_image_match; offer the full image instead
        person_patches = [image_patch]
    person_patch = best_image_match(person_patches, ["white pants", "yellow pants"])
    # Remember: return the person
    return person_patch
| # Given an image: Find cow facing the camera. | |
def execute_command(image) -> ImagePatch:
    """Return the cow patch that best matches "cow facing the camera" (whole image if no cow is detected)."""
    whole_image = ImagePatch(image)
    candidates = whole_image.find("cow")
    if not candidates:
        # No detection: the full image becomes the only candidate
        candidates = [whole_image]
    return best_image_match(list_patches=candidates, content=["cow facing the camera"])
| # Given a list of images: Is the statement true? There is 1 image that contains exactly 3 blue papers. | |
def execute_command(image_list) -> str:
    """Answer "yes" when exactly one image contains exactly three papers verified as blue."""
    matching_images = 0
    for raw_image in image_list:
        papers = ImagePatch(raw_image).find("paper")
        # Every paper is checked; count those that pass the colour test
        blue_total = sum(1 for paper in papers if paper.verify_property("paper", "blue"))
        if blue_total == 3:
            matching_images += 1
    return bool_to_yesno(matching_images == 1)
| # Given an image: Find black car just under stop sign. | |
def execute_command(image) -> ImagePatch:
    """Return the black car below the stop sign whose vertical center is closest to the sign's."""
    # Return the car
    image_patch = ImagePatch(image)
    stop_sign_patches = image_patch.find("stop sign")
    if len(stop_sign_patches) == 0:
        stop_sign_patches = [image_patch]
    stop_sign_patch = stop_sign_patches[0]
    car_patches = image_patch.find("black car")
    if len(car_patches) == 0:
        # No car detected: fall back to the full image so a patch can always be returned
        car_patches = [image_patch]
    car_under_stop = [car for car in car_patches if car.upper < stop_sign_patch.upper]
    if len(car_under_stop) == 0:
        # No car lies below the sign: consider every candidate instead of indexing an empty list
        car_under_stop = car_patches
    # Find car that is closest to the stop sign: compare absolute vertical offsets, because the
    # signed difference is negative for cars below the sign and would rank the farthest car first
    car_under_stop.sort(key=lambda car: abs(car.vertical_center - stop_sign_patch.vertical_center))
    # Remember: return the car
    return car_under_stop[0]
| # Given a list of images: Is there either a standing man that is holding a cell phone or a sitting man that is holding a cell phone? | |
def execute_command(image_list) -> str:
    """Answer "yes" when some man holds a cell phone and is either sitting or standing."""
    for raw_image in image_list:
        for man in ImagePatch(raw_image).find("man"):
            if man.simple_query("Is this man holding a cell phone?") != "yes":
                continue
            # Sitting is asked first; standing is only asked when sitting was not confirmed
            for posture_question in ("Is this man sitting?", "Is this man standing?"):
                if man.simple_query(posture_question) == "yes":
                    return 'yes'
    return 'no'
# Given an image: How many people are running while looking at their cell phone?
def execute_command(image) -> str:
    """Count the persons that are both running and looking at a cell phone; return the count as a string."""
    whole_image = ImagePatch(image)
    people = whole_image.find("person")
    if not people:
        # Nobody detected: ask the image directly
        return whole_image.simple_query("How many people are running while looking at their cell phone?")
    matches = 0
    for person in people:
        # Two conditions, checked in order: (1) running (2) looking at cell phone
        if person.simple_query("Is the person running?") != "yes":
            continue
        if person.simple_query("Is the person looking at cell phone?") == "yes":
            matches += 1
    return str(matches)
| # Given a list of images: Does the car that is on a highway and the car that is on a street have the same color? | |
def execute_command(image_list) -> str:
    """Compare the colour of the car on a highway with the car on a street; answers "no" when
    either car could not be located."""
    color_1 = None  # colour of the (last seen) car on the highway
    color_2 = None  # colour of the (last seen) car on a street
    for image in image_list:
        image = ImagePatch(image)
        car_patches = image.find("car")
        for car_patch in car_patches:
            if car_patch.simple_query("Is the car on the highway?") == "yes":
                color_1 = car_patch.simple_query("What is the color of the car?")
            elif car_patch.simple_query("Is the car on a street?") == "yes":
                color_2 = car_patch.simple_query("What is the color of the car?")
    if color_1 is None or color_2 is None:
        # A car that was never found has no colour; None == None must not count as "same"
        return "no"
    return bool_to_yesno(color_1 == color_2)
| # Given a list of images: Is the statement true? There are 3 magazine that are on table. | |
def execute_command(image_list) -> str:
    """Answer "yes" when exactly three magazines across all images are reported to be on a table."""
    on_table_total = 0
    for raw_image in image_list:
        for magazine in ImagePatch(raw_image).find("magazine"):
            if magazine.simple_query("Is the magazine on a table?") == "yes":
                on_table_total += 1
    return bool_to_yesno(on_table_total == 3)
| # INSERT_QUERY_HERE |