Update app.py
app.py CHANGED
@@ -19,92 +19,35 @@ import torch
 import tqdm
 from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation
 
-DESCRIPTION = "#
+DESCRIPTION = "# DAB-DETR"
 
 MAX_NUM_FRAMES = 300
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
-
-
-
-pose_model_name = "usyd-community/vitpose-base-simple"
-pose_image_processor = AutoProcessor.from_pretrained(pose_model_name)
-pose_model = VitPoseForPoseEstimation.from_pretrained(pose_model_name, device_map=device)
+checkpoint = "IDEA-Research/dab-detr-resnet-50-dc5-pat3"
+image_processor = AutoProcessor.from_pretrained(person_detector_name)
+model = RTDetrForObjectDetection.from_pretrained(person_detector_name, device_map=device)
 
 
 @spaces.GPU(duration=5)
 @torch.inference_mode()
 def process_image(image: PIL.Image.Image) -> tuple[PIL.Image.Image, list[dict]]:
-    inputs =
-    outputs =
+    inputs = image_processor(images=image, return_tensors="pt").to(device)
+    outputs = model(**inputs)
     results = person_image_processor.post_process_object_detection(
         outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
     )
     result = results[0]  # take first image results
+    boxes_xyxy = result["boxes"].cpu().numpy()
 
-
-    person_boxes_xyxy = result["boxes"][result["labels"] == 0]
-    person_boxes_xyxy = person_boxes_xyxy.cpu().numpy()
-
-    # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format
-    person_boxes = person_boxes_xyxy.copy()
-    person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
-    person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
-
-    inputs = pose_image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
-
-    # for vitpose-plus-base checkpoint we should additionally provide dataset_index
-    # to specify which MOE experts to use for inference
-    if pose_model.config.backbone_config.num_experts > 1:
-        dataset_index = torch.tensor([0] * len(inputs["pixel_values"]))
-        dataset_index = dataset_index.to(inputs["pixel_values"].device)
-        inputs["dataset_index"] = dataset_index
-
-    outputs = pose_model(**inputs)
-
-    pose_results = pose_image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
-    image_pose_result = pose_results[0]  # results for first image
-
-    # make results more human-readable
-    human_readable_results = []
-    for i, person_pose in enumerate(image_pose_result):
-        data = {
-            "person_id": i,
-            "bbox": person_pose["bbox"].numpy().tolist(),
-            "keypoints": [],
-        }
-        for keypoint, label, score in zip(
-            person_pose["keypoints"], person_pose["labels"], person_pose["scores"], strict=True
-        ):
-            keypoint_name = pose_model.config.id2label[label.item()]
-            x, y = keypoint
-            data["keypoints"].append({"name": keypoint_name, "x": x.item(), "y": y.item(), "score": score.item()})
-        human_readable_results.append(data)
-
-    # preprocess to torch tensor of shape (n_objects, n_keypoints, 2)
-    xy = [pose_result["keypoints"] for pose_result in image_pose_result]
-    xy = torch.stack(xy).cpu().numpy()
-
-    scores = [pose_result["scores"] for pose_result in image_pose_result]
-    scores = torch.stack(scores).cpu().numpy()
-
-    keypoints = sv.KeyPoints(xy=xy, confidence=scores)
-    detections = sv.Detections(xyxy=person_boxes_xyxy)
-
-    edge_annotator = sv.EdgeAnnotator(color=sv.Color.GREEN, thickness=1)
-    vertex_annotator = sv.VertexAnnotator(color=sv.Color.RED, radius=2)
+    detections = sv.Detections(xyxy=boxes_xyxy)
     bounding_box_annotator = sv.BoxAnnotator(color=sv.Color.WHITE, color_lookup=sv.ColorLookup.INDEX, thickness=1)
 
-    annotated_frame = image.copy()
-
     # annotate bounding boxes
     annotated_frame = bounding_box_annotator.annotate(scene=image.copy(), detections=detections)
 
-
-    annotated_frame = edge_annotator.annotate(scene=annotated_frame, key_points=keypoints)
-    return vertex_annotator.annotate(scene=annotated_frame, key_points=keypoints), human_readable_results
+    return annotated_frame
 
 
 @spaces.GPU(duration=90)
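As committed, the added lines still load from `person_detector_name` and post-process with `person_image_processor`, names that only existed in the removed person-detector setup, so the new file would raise a NameError at import and inference time; the function also keeps its `tuple[PIL.Image.Image, list[dict]]` annotation while now returning a single image. Below is a minimal, self-contained sketch of the detection-only path this change appears to aim for, with the names unified around the new `checkpoint`. The switch to `AutoImageProcessor`/`AutoModelForObjectDetection` (rather than the RT-DETR class left in the import), the dropped `spaces.GPU` decorator, and the simplified return type are assumptions for illustration, not part of the commit.

import PIL.Image
import supervision as sv
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint from the commit; processor/model names unified (the commit still
# references the removed `person_detector_name` here).
checkpoint = "IDEA-Research/dab-detr-resnet-50-dc5-pat3"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForObjectDetection.from_pretrained(checkpoint).to(device)


@torch.inference_mode()
def process_image(image: PIL.Image.Image) -> PIL.Image.Image:
    # Run the detector on a single image.
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    outputs = model(**inputs)

    # Keep detections above the confidence threshold, rescaled to the input size.
    results = image_processor.post_process_object_detection(
        outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
    )
    result = results[0]  # first (and only) image in the batch
    boxes_xyxy = result["boxes"].cpu().numpy()

    # Draw the boxes with supervision, as the committed code does.
    detections = sv.Detections(xyxy=boxes_xyxy)
    bounding_box_annotator = sv.BoxAnnotator(color=sv.Color.WHITE, color_lookup=sv.ColorLookup.INDEX, thickness=1)
    return bounding_box_annotator.annotate(scene=image.copy(), detections=detections)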
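For reference, a small usage example of the sketch above; the file paths are hypothetical.

if __name__ == "__main__":
    # Any RGB image works; the path is a placeholder.
    image = PIL.Image.open("example.jpg").convert("RGB")
    annotated = process_image(image)
    annotated.save("annotated.jpg")
    print("Saved annotated.jpg")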