import torch
import numpy as np
import supervision as sv
from PIL import Image

CAPTIONING_TASK = "<DETAILED_CAPTION>"
CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"

def run_captioning(model, processor, image: np.ndarray, device: torch.device) -> str:
    """Generate a detailed caption for an image with Florence-2."""
    image = Image.fromarray(image).convert("RGB")
    inputs = processor(text=CAPTIONING_TASK, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=CAPTIONING_TASK, image_size=image.size)
    # post_process_generation returns a dict keyed by the task token;
    # extract the caption string so the return type matches the annotation.
    return response[CAPTIONING_TASK]

def run_caption_to_phrase_grounding(
    model,
    processor,
    caption: str,
    image: np.ndarray,
    device: torch.device
) -> sv.Detections:
    """Ground phrases from a caption as bounding boxes with Florence-2."""
    image = Image.fromarray(image).convert("RGB")
    text = f"{CAPTION_TO_PHRASE_GROUNDING_TASK} {caption}"
    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=CAPTION_TO_PHRASE_GROUNDING_TASK, image_size=image.size)
    # Convert the Florence-2 grounding response into supervision Detections.
    return sv.Detections.from_lmm(sv.LMM.FLORENCE_2, response, resolution_wh=image.size)
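

# --- Usage sketch (illustrative addition, not part of the original Space) ---
# Shows one way to load Florence-2 and call the two helpers above. The
# checkpoint id "microsoft/Florence-2-large" and the file "image.jpg" are
# assumptions; substitute your own checkpoint and image path.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoProcessor

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large", trust_remote_code=True).to(DEVICE)
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large", trust_remote_code=True)

    image = np.asarray(Image.open("image.jpg").convert("RGB"))
    caption = run_captioning(model, processor, image, DEVICE)
    detections = run_caption_to_phrase_grounding(model, processor, caption, image, DEVICE)
    print(caption)
    print(f"{len(detections)} grounded phrases")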