Commit ad7aaa6 · jwyang committed · 1 parent: a574e10

add heatmap visualization

Files changed:
- app.py (+16 -4)
- model/image_encoder/swin_transformer.py (+8 -4)
- model/model.py (+15 -4)
app.py CHANGED

@@ -118,11 +118,20 @@ def recognize_image(image, texts):
     text_embeddings = model.get_text_embeddings(texts.split(';'))
 
     # compute output
-    feat_img = model.encode_image(img_t.unsqueeze(0))
+    feat_img, feat_map = model.encode_image(img_t.unsqueeze(0), output_map=True)
     output = model.logit_scale.exp() * feat_img @ text_embeddings.t()
     prediction = output.softmax(-1).flatten()
 
-    return {texts.split(';')[i]: float(prediction[i]) for i in range(len(texts.split(';')))}
+    # generate feat map given the top matched texts
+    output_map = (feat_map * text_embeddings[prediction.argmax()].unsqueeze(-1)).sum(1).softmax(-1)
+    output_map = output_map.view(1, 1, 7, 7)
+
+    output_map = nn.Upsample(size=img_t.shape[1:], mode='bilinear')(output_map)
+    output_map = output_map.squeeze(1).detach().permute(1, 2, 0).numpy()
+    output_map = (output_map - output_map.min()) / (output_map.max() - output_map.min())
+    heatmap = show_cam_on_image(img_d, output_map, use_rgb=True)
+
+    return Image.fromarray(heatmap), {texts.split(';')[i]: float(prediction[i]) for i in range(len(texts.split(';')))}
 
 
 image = gr.inputs.Image()

@@ -132,8 +141,11 @@ gr.Interface(
     description="UniCL for Zero-shot Image Recognition Demo (https://github.com/microsoft/unicl)",
     fn=recognize_image,
     inputs=["image", "text"],
-    outputs=[
-        label,
+    outputs=[
+        gr.outputs.Image(
+            type="pil",
+            label="zero-shot heat map"),
+        label,
     ],
     examples=[
         ["./elephants.png", "an elephant; an elephant walking in the river; four elephants walking in the river"],
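Taken on its own, the new heat-map branch in recognize_image is a short piece of tensor algebra: score each spatial token against the top-matching text embedding, softmax over tokens, reshape to the 7x7 grid, and upsample to image resolution. Below is a minimal sketch of that computation on dummy tensors; the 224x224 input size, the 512-dim embedding, and the random tensors are illustrative assumptions, and the final show_cam_on_image overlay (from the pytorch-grad-cam package) is left out.

# Minimal sketch of the text-conditioned heat map on dummy tensors.
# Assumes a 224x224 input and Swin's final 7x7 token grid (49 tokens);
# dim=512 is a placeholder for the projected embedding size.
import torch
import torch.nn as nn

dim, num_texts = 512, 3
feat_map = torch.randn(1, dim, 49)             # B x C x L, as from encode_image(..., output_map=True)
text_embeddings = torch.randn(num_texts, dim)
prediction = torch.rand(num_texts).softmax(-1)

# similarity of every spatial token to the best-matching text, softmaxed over tokens
output_map = (feat_map * text_embeddings[prediction.argmax()].unsqueeze(-1)).sum(1).softmax(-1)
output_map = output_map.view(1, 1, 7, 7)

# upsample the 7x7 grid to image resolution, then min-max normalize to [0, 1]
output_map = nn.Upsample(size=(224, 224), mode='bilinear')(output_map)
output_map = output_map.squeeze(1).detach().permute(1, 2, 0).numpy()
output_map = (output_map - output_map.min()) / (output_map.max() - output_map.min())
print(output_map.shape)  # (224, 224, 1), ready to overlay on the input image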
model/image_encoder/swin_transformer.py CHANGED

@@ -557,7 +557,7 @@ class SwinTransformer(nn.Module):
     def no_weight_decay_keywords(self):
         return {'relative_position_bias_table'}
 
-    def forward_features(self, x):
+    def forward_features(self, x, output_map=False):
         x = self.patch_embed(x)
         if self.ape:
             x = x + self.absolute_pos_embed

@@ -566,10 +566,14 @@
         for layer in self.layers:
             x = layer(x)
 
-        x = self.norm(x)  # B L C
-        x = self.avgpool(x.transpose(1, 2))  # B C 1
+        x_map = self.norm(x).transpose(1, 2)  # B C L
+        x = self.avgpool(x_map)  # B C 1
         x = torch.flatten(x, 1)
-        return x
+
+        if output_map:
+            return x, x_map
+        else:
+            return x
 
     def forward(self, x):
         x = self.forward_features(x)
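The forward_features change keeps the normalized per-token features (transposed to B C L) alongside the usual pooled vector. A shape-only sketch under assumed sizes (768 channels after the last stage, a 7x7 grid; nn.AdaptiveAvgPool1d(1) is the pooling used in the stock SwinTransformer):

# Shape sketch of the new forward_features contract on dummy data.
# B=2 images, L=49 tokens (7x7), C=768 channels are assumed sizes.
import torch
import torch.nn as nn

B, L, C = 2, 49, 768
x = torch.randn(B, L, C)              # token sequence after the last Swin stage

norm = nn.LayerNorm(C)
avgpool = nn.AdaptiveAvgPool1d(1)

x_map = norm(x).transpose(1, 2)       # B C L: per-token features, kept for the heat map
x = torch.flatten(avgpool(x_map), 1)  # B C:   pooled global feature, unchanged behavior

print(x_map.shape, x.shape)           # torch.Size([2, 768, 49]) torch.Size([2, 768])

With output_map=False the method still returns only x, so existing callers such as forward are unaffected.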
model/model.py CHANGED

@@ -153,14 +153,25 @@ class UniCLModel(nn.Module):
         imnet_text_embeddings = torch.stack(clss_embeddings, dim=0)
         return imnet_text_embeddings
 
-    def encode_image(self, image, norm=True):
-        x = self.image_encoder.forward_features(image)
+    def encode_image(self, image, norm=True, output_map=False):
+        x = self.image_encoder.forward_features(image, output_map=output_map)
+        if output_map:
+            x, x_map = x
+
         x = x @ self.image_projection
 
+        if output_map:
+            x_map = self.image_projection.unsqueeze(0).transpose(1, 2) @ x_map
+
         if norm:
             x = x / x.norm(dim=-1, keepdim=True)
-
-        return x
+            if output_map:
+                x_map = x_map / x_map.norm(dim=1, keepdim=True)
+
+        if output_map:
+            return x, x_map
+        else:
+            return x
 
     def encode_text(self, text, norm=True):
         x = self.text_encoder(**text)
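encode_image reuses the pooled path's image_projection for every spatial token, so the token map lands in the same embedding space as the text features, and normalizing along dim=1 makes each token a unit vector there. A shape sketch with assumed sizes (C_img=768 and C_emb=512 are placeholders):

# Shape sketch of projecting the token map into the shared embedding space.
# C_img=768 (encoder channels) and C_emb=512 (joint space) are assumed sizes.
import torch

B, L, C_img, C_emb = 2, 49, 768, 512
image_projection = torch.randn(C_img, C_emb)  # same parameter as the pooled path
x_map = torch.randn(B, C_img, L)              # B C L from forward_features

# (1, C_emb, C_img) @ (B, C_img, L) -> (B, C_emb, L)
x_map = image_projection.unsqueeze(0).transpose(1, 2) @ x_map
x_map = x_map / x_map.norm(dim=1, keepdim=True)  # unit-normalize each token's embedding

print(x_map.shape)  # torch.Size([2, 512, 49])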