Commit: update cap
app.py
CHANGED
@@ -2,8 +2,8 @@ import os
 import sys
 from pathlib import Path
 # os.system("cd transformers && pip install .")
-os.system("cd multimodal && pip install -e .")
-os.system("cd multimodal/YOLOX && pip install .")
+# os.system("cd multimodal && pip install -e .")
+# os.system("cd multimodal/YOLOX && pip install .")
 import numpy as np
 import torch
 from PIL import Image
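Note: this hunk comments out the two os.system(... pip install ...) calls that re-ran on every startup and could fail at runtime. For comparison, a minimal sketch of an idempotent install guard; the import names "open_flamingo" and "yolox" are illustrative assumptions, not taken from this repo:

import importlib.util
import subprocess
import sys

def ensure_local_package(import_name: str, package_dir: str) -> None:
    # Install the editable local package only if it is not already importable.
    if importlib.util.find_spec(import_name) is None:
        # check=True surfaces a failed install at startup rather than as a
        # confusing ImportError later.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-e", package_dir],
            check=True,
        )

# Hypothetical import names; the real ones depend on each package's setup metadata.
ensure_local_package("open_flamingo", "multimodal")
ensure_local_package("yolox", "multimodal/YOLOX")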
multimodal/open_flamingo/chat/conversation.py
CHANGED
@@ -324,7 +324,7 @@ class Chat:
 repetition_penalty=1.0, length_penalty=1, temperature=1, max_length=2000):
 # conv.append_message(conv.roles[1], None)
 # embs = self.get_context_emb(conv, img_list)
-#
+#
 # # current_max_len = embs.shape[1] + max_new_tokens + 100
 # # begin_idx = max(0, current_max_len - max_length)
 # # embs = embs[:, begin_idx:]
@@ -494,7 +494,7 @@ class Chat:
 # if len(image.shape) == 3:
 # image = image.unsqueeze(0)
 # # image = image.to(self.device)
-#
+#
 # # image_emb, _ = self.model.encode_img(image)
 # img_list.append(image_emb)
 # conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
@@ -587,4 +587,3 @@ def evaluate_exp(
 
 
 
-
Note: the two "-#"/"+#" pairs above render identically; together with the deleted trailing blank line, these appear to be whitespace-only changes.
multimodal/open_flamingo/eval/task/caption_chat.py
CHANGED
@@ -51,7 +51,8 @@ def prepare_batch_images(batch, image_processor):
 
 
 def captioner(
-
+model, tokenizer, image_ori, batch_images, input_ids, attention_mask, image_start_index_list, image_nums,
+added_bbox_list, debug=True):
 """Evaluate a model on COCO dataset.
 Returns:
 float: CIDEr score
@@ -80,7 +81,6 @@ def captioner(
 input_ids = input_ids
 attention_mask = attention_mask
 else:
-
 encodings = tokenizer(
 [prompt],
 padding="longest",
@@ -93,7 +93,7 @@
 image_start_index_list = image_start_index_list
 image_nums = image_nums
 if debug:
-print("input--->",tokenizer.decode(input_ids[0]))
+print("input--->", tokenizer.decode(input_ids[0]))
 p1 = MinNewTokensLengthLogitsProcessor(
 prompt_length_to_skip=input_ids.shape[-1],
 min_new_tokens=5,
@@ -114,7 +114,7 @@
 logits_processor_list=[p1, visual_logits_processor],
 )
 if debug:
-print("outputs--->",tokenizer.decode(outputs[0]))
+print("outputs--->", tokenizer.decode(outputs[0]))
 if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
 prompt = tokenizer.decode(outputs.clone()[0])
 is_visual = (outputs[0, -2] == visual_token_id)
@@ -132,7 +132,7 @@
 image_start_index_list = [[x] for x in image_start_index_list]
 image_nums = [1] * len(input_ids)
 if debug:
-print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
+print("get the visual bbox--->", tokenizer.decode(input_ids[0]))
 with torch.no_grad():
 outputs = model(
 vision_x=batch_images,
@@ -145,6 +145,8 @@
 )
 boxes = outputs["boxes"]
 scores = outputs["scores"]
+if debug:
+print("box num---->", len(boxes))
 # if not model.valid:
 # import pdb; pdb.set_trace()
 if boxes is not None:
@@ -168,7 +170,8 @@
 open_cv_image = np.array(image_ori)
 open_cv_image = open_cv_image[:, :, ::-1].copy()
 for i, pre_box in enumerate(boxes):
-open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int),
+open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int),
+(0, 255, 0), i + 1)
 out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
 # exit()
 pre_box = boxes[scores.argmax()]
@@ -181,7 +184,14 @@
 else:
 # if debug:
 # import pdb;pdb.set_trace()
+prompt = tokenizer.decode(outputs.clone()[0])
+if debug:
+print("before else---->", prompt)
 prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
+if debug:
+print("after else---->", prompt)
+
+
 else:
 break
 outputs = outputs[:, ori_prompt_length:]
@@ -190,7 +200,8 @@
 # postprocess_captioning_generation(out).replace('"', "")
 # for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
 # ]
-
+# import pdb; pdb.set_trace()
+print("out----------------------------------------------------------------------------------------->")
 return outputs, out_image
 
 
@@ -428,5 +439,4 @@ def evaluate_coco_flickr(
 metrics = {}
 metrics["CIDEr"] = 0.0
 
-
 return metrics["CIDEr"]
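Note: the @@ -168,7 +170,8 @@ hunk completes a cv2.rectangle(...) call that previously ended mid-argument-list, supplying the color and thickness on a continuation line. A self-contained sketch of that drawing step, with the helper name draw_boxes invented here for illustration:

import cv2
import numpy as np
from PIL import Image

def draw_boxes(image_ori: Image.Image, boxes: np.ndarray) -> Image.Image:
    # PIL works in RGB, OpenCV in BGR, so convert on the way in and out.
    open_cv_image = np.array(image_ori)[:, :, ::-1].copy()
    for i, pre_box in enumerate(boxes):
        # Each box is [x1, y1, x2, y2]; cv2.rectangle expects integer corners.
        open_cv_image = cv2.rectangle(
            open_cv_image,
            tuple(map(int, pre_box[:2])),
            tuple(map(int, pre_box[2:])),
            (0, 255, 0),  # green, in BGR order
            i + 1,        # thickness grows with the box index, as in the diff
        )
    return Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))

out_image = draw_boxes(Image.new("RGB", (256, 256)),
                       np.array([[10, 10, 80, 80], [40, 40, 200, 200]]))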
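Note: most of the remaining caption_chat.py hunks gate trace output behind the new debug=True parameter (print("input--->", ...), print("box num---->", ...), and so on). If that tracing is kept long-term, a hedged sketch of the same toggle via the standard logging module; the logger name and helper are illustrative only:

import logging

logger = logging.getLogger("captioner")

def set_debug(enabled: bool) -> None:
    # One switch instead of an `if debug:` check at every call site.
    logging.basicConfig(format="%(name)s: %(message)s")
    logger.setLevel(logging.DEBUG if enabled else logging.WARNING)

# e.g. logger.debug("input---> %s", tokenizer.decode(input_ids[0]))
set_debug(True)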