attempt to make tag vis work
--- a/app.py
+++ b/app.py
@@ -12,8 +12,6 @@ from torchvision.transforms import InterpolationMode
 import torchvision.transforms.functional as TF
 from huggingface_hub import hf_hub_download
 
-torch.set_grad_enabled(False)
-
 class Fit(torch.nn.Module):
     def __init__(
         self,
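Note: dropping the module-level `torch.set_grad_enabled(False)` is what makes the new Grad-CAM pass possible at all; with gradients disabled process-wide, `backward()` raises and the backward hook never fires. A minimal sketch of the trade-off, with a dummy module standing in for the real tagger:

```python
import torch

model = torch.nn.Linear(4, 2)   # stand-in for the real model
x = torch.randn(1, 4)

torch.set_grad_enabled(False)   # what the old module-level call did
y = model(x)
print(y.requires_grad)          # False: y.sum().backward() here would raise

torch.set_grad_enabled(True)    # state after this commit removes the call
with torch.no_grad():           # plain tagging stays cheap, scoped locally
    y = model(x)

y = model(x)                    # CAM path: the graph is recorded again
y.sum().backward()
print(model.weight.grad.shape)  # torch.Size([2, 4])
```

Scoping `torch.no_grad()` to the plain tagging path, as `run_classifier` already does, keeps inference cheap without blocking the CAM backward pass.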
@@ -155,11 +153,14 @@ for idx, tag in enumerate(allowed_tags):
     allowed_tags[idx] = tag.replace("_", " ")
 
 sorted_tag_score = {}
+input_image = None
+
 
 @spaces.GPU(duration=5)
 def run_classifier(image, threshold):
-    global sorted_tag_score
-
+    global sorted_tag_score, input_image
+    input_image = image.convert('RGBA')
+    img = input_image
     tensor = transform(img).unsqueeze(0)
 
     with torch.no_grad():
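Note: caching the upload in a module-level `input_image` works in single-user testing, but Gradio globals are shared across all concurrent sessions, so on a busy Space one visitor's click could overlay a CAM on another visitor's image. A per-session variant would route the image through `gr.State`; this sketch reuses the app's component names (`image_input`, `threshold_slider`, `tag_string`, `label_box`) but the wiring is illustrative, not the commit's code:

```python
import gradio as gr

def run_classifier(image, threshold, stored):
    stored = image.convert("RGBA")       # keep the upload in per-session state
    # ... model inference as in the app ...
    return "tag string", {}, stored

with gr.Blocks() as demo:
    stored_image = gr.State(value=None)  # one copy per browser session
    image_input = gr.Image(type="pil")
    threshold_slider = gr.Slider(0.0, 1.0, value=0.3)
    tag_string = gr.Textbox()
    label_box = gr.Label()

    image_input.upload(
        fn=run_classifier,
        inputs=[image_input, threshold_slider, stored_image],
        outputs=[tag_string, label_box, stored_image],
    )
```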
@@ -180,10 +181,124 @@ def create_tags(threshold):
     return text_no_impl, filtered_tag_score
 
 def clear_image():
-    global sorted_tag_score
+    global sorted_tag_score, input_image
+    input_image = None
     sorted_tag_score = {}
     return "", {}
 
+target_tag_index = None
+
+# Store hooks and intermediate values
+gradients = {}
+activations = {}
+
+def hook_forward(module, input, output):
+    activations['value'] = output
+
+def hook_backward(module, grad_in, grad_out):
+    gradients['value'] = grad_out[0]
+
+def cam_inference(target_tag, threshold):
+    global input_image, sorted_tag_score, target_tag_index, gradients, activations
+    img = input_image
+    tensor = transform(img).unsqueeze(0)
+
+    gradients = {}
+    activations = {}
+    cam = None
+    target_tag_index = None
+
+    if target_tag:
+        if target_tag not in allowed_tags:
+            print(f"Warning: Target tag '{target_tag}' not found in allowed tags.")
+            target_tag = None
+        else:
+            target_tag_index = allowed_tags.index(target_tag)
+    handle_forward = model.norm.register_forward_hook(hook_forward)
+    handle_backward = model.norm.register_full_backward_hook(hook_backward)
+
+    probits = model(tensor)[0].cpu()
+
+
+    if target_tag is not None and target_tag_index is not None:
+        model.zero_grad()
+        target_score = probits[target_tag_index]
+        target_score.backward(retain_graph=True)
+
+        grads = gradients.get('value')
+        acts = activations.get('value')
+
+        if grads is not None and acts is not None:
+            patch_grads = grads
+            patch_acts = acts
+
+            weights = torch.mean(patch_grads, dim=1).squeeze(0)
+
+            cam_1d = torch.einsum('pe,e->p', patch_acts.squeeze(0), weights)
+            cam_1d = torch.relu(cam_1d)
+
+            cam = cam_1d.reshape(27, 27).detach().cpu().numpy()
+
+
+    handle_forward.remove()
+    handle_backward.remove()
+    gradients = {}
+    activations = {}
+
+    return create_cam_visualization_pil(cam, vis_threshold=threshold)
+
+def create_cam_visualization_pil(cam, alpha=0.6, vis_threshold=0.2):
+    """
+    Overlays CAM on the stored input image and returns a PIL image.
+
+    Args:
+        cam: 2D numpy array (activation map)
+        alpha: float, blending factor
+        vis_threshold: float, minimum normalized CAM value to show color
+        (the base image is read from the module-level input_image)
+
+    Returns:
+        PIL.Image.Image with overlay
+    """
+    global input_image
+    if cam is None:
+        print("CAM is None, skipping visualization.")
+        return input_image
+    # Convert to RGB (in case RGBA or others)
+    image_pil = input_image.convert("RGB")
+    w, h = image_pil.size
+
+    # Resize CAM to match image
+    cam_resized = np.array(Image.fromarray(cam).resize((w, h), resample=Image.BILINEAR))
+
+    # Normalize CAM to [0, 1]
+    cam_norm = (cam_resized - cam_resized.min()) / (cam_resized.ptp() + 1e-8)
+
+    # Apply threshold mask
+    mask = cam_norm >= vis_threshold
+
+    # Create heatmap using matplotlib colormap
+    colormap = cm.get_cmap('jet')
+    heatmap_rgba = colormap(cam_norm)  # shape: (H, W, 4), values in [0, 1]
+    heatmap_rgb = (heatmap_rgba[:, :, :3] * 255).astype(np.uint8)
+
+    # Convert heatmap to PIL image
+    heatmap_pil = Image.fromarray(heatmap_rgb).convert("RGB")
+
+    # Convert images to NumPy for blending
+    base_np = np.array(image_pil).astype(np.float32)
+    heat_np = np.array(heatmap_pil).astype(np.float32)
+
+    # Blend only where mask is True
+    blended_np = base_np.copy()
+    blended_np[mask] = base_np[mask] * (1 - alpha) + heat_np[mask] * alpha
+    blended_np = np.clip(blended_np, 0, 255).astype(np.uint8)
+
+    # Convert back to PIL image
+    blended_img = Image.fromarray(blended_np)
+    return blended_img
+
+
 with gr.Blocks(css=".output-class { display: none; }") as demo:
     gr.Markdown("""
     ## Joint Tagger Project: JTP-PILOT² Demo **BETA**
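Note: the CAM computation itself is standard Grad-CAM applied at `model.norm`: average the gradients over patches to get per-channel weights, then score each patch by the weighted sum of its activations. The `reshape(27, 27)` presumes a 27×27 patch grid; the embedding width below (1152, SigLIP-so400m-style) is likewise an assumption for the sketch, not a value read from this model:

```python
import torch

patches, embed_dim = 27 * 27, 1152           # assumed ViT geometry for the sketch
acts = torch.randn(1, patches, embed_dim)    # what the forward hook captures at model.norm
grads = torch.randn(1, patches, embed_dim)   # what the backward hook captures (d score / d acts)

weights = torch.mean(grads, dim=1).squeeze(0)               # (embed_dim,) channel importances
cam_1d = torch.einsum("pe,e->p", acts.squeeze(0), weights)  # weighted sum per patch
cam_1d = torch.relu(cam_1d)                                 # keep positively contributing patches
cam = cam_1d.reshape(27, 27)                                # 2-D heat grid, upsampled later
print(cam.shape)                                            # torch.Size([27, 27])
```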
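Note: two calls in `create_cam_visualization_pil` have since aged out of their libraries: `cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9, and the `ndarray.ptp()` method was removed in NumPy 2.0 (the `np.ptp` function remains). A version-safe sketch of the same normalize-threshold-blend step on a dummy map:

```python
import numpy as np
import matplotlib

colormap = matplotlib.colormaps["jet"]               # registry lookup, replaces cm.get_cmap("jet")

cam = np.random.rand(27, 27).astype(np.float32)      # stand-in for the resized CAM
cam_norm = (cam - cam.min()) / (np.ptp(cam) + 1e-8)  # normalize to [0, 1]
mask = cam_norm >= 0.2                               # vis_threshold
heat = (colormap(cam_norm)[:, :, :3] * 255).astype(np.float32)

base = np.zeros((27, 27, 3), dtype=np.float32)       # stand-in for the RGB photo
alpha = 0.6
base[mask] = base[mask] * (1 - alpha) + heat[mask] * alpha
blended = np.clip(base, 0, 255).astype(np.uint8)
```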
@@ -219,5 +334,11 @@ with gr.Blocks(css=".output-class { display: none; }") as demo:
         outputs=[tag_string, label_box]
     )
 
+    label_box.select(
+        fn=cam_inference,
+        inputs=[threshold_slider],
+        outputs=[image_input]
+    )
+
 if __name__ == "__main__":
     demo.launch()
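Note: as committed, the wiring looks inconsistent with the handler: `cam_inference(target_tag, threshold)` expects two inputs, but `inputs=[threshold_slider]` supplies only one, so `target_tag` would receive the slider value and the clicked tag would never arrive. Gradio delivers the clicked `Label` entry through a `gr.SelectData` event argument; a sketch of the signature `.select` would typically need (the body is elided, not the commit's code):

```python
import gradio as gr

def cam_inference(evt: gr.SelectData, threshold):
    target_tag = evt.value  # the label entry the user clicked on
    # ... register hooks, backward from the tag's logit, build the overlay ...
    return None             # placeholder; the app returns the blended PIL image

# wiring stays: label_box.select(fn=cam_inference, inputs=[threshold_slider], outputs=[image_input])
```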