jwyang committed · Commit eb1d5d5 · 1 Parent(s): 8424dda

support arbitary size
Files changed:
- app.py (+3 -3)
- model/__pycache__/__init__.cpython-39.pyc (+0 -0)
- model/__pycache__/model.cpython-39.pyc (+0 -0)
- model/__pycache__/templates.cpython-39.pyc (+0 -0)
- model/image_encoder/__pycache__/__init__.cpython-39.pyc (+0 -0)
- model/image_encoder/__pycache__/build.cpython-39.pyc (+0 -0)
- model/image_encoder/__pycache__/focalnet.cpython-39.pyc (+0 -0)
- model/image_encoder/__pycache__/swin_transformer.cpython-39.pyc (+0 -0)
- model/image_encoder/swin_transformer.py (+71 -25)
- model/model.py (+2 -2)
- model/text_encoder/__pycache__/__init__.cpython-39.pyc (+0 -0)
- model/text_encoder/__pycache__/build.cpython-39.pyc (+0 -0)
- model/text_encoder/__pycache__/hf_model.cpython-39.pyc (+0 -0)
- model/text_encoder/__pycache__/registry.cpython-39.pyc (+0 -0)
- model/text_encoder/__pycache__/transformer.cpython-39.pyc (+0 -0)
app.py CHANGED

@@ -118,13 +118,13 @@ def recognize_image(image, texts):
     text_embeddings = model.get_text_embeddings(texts.split(';'))

     # compute output
-    feat_img, feat_map = model.encode_image(img_t.unsqueeze(0), output_map=True)
+    feat_img, feat_map, H, W = model.encode_image(img_t.unsqueeze(0), output_map=True)
     output = model.logit_scale.exp() * feat_img @ text_embeddings.t()
     prediction = output.softmax(-1).flatten()

     # generate feat map given the top matched texts
     output_map = (feat_map * text_embeddings[prediction.argmax()].unsqueeze(-1)).sum(1).softmax(-1)
-    output_map = output_map.view(1, 1,
+    output_map = output_map.view(1, 1, H, W)

     output_map = nn.Upsample(size=img_t.shape[1:], mode='bilinear')(output_map)
     output_map = output_map.squeeze(1).detach().permute(1, 2, 0).numpy()

@@ -142,10 +142,10 @@ gr.Interface(
     fn=recognize_image,
     inputs=["image", "text"],
     outputs=[
-        label,
         gr.outputs.Image(
             type="pil",
             label="zero-shot heat map"),
+        label
     ],
     examples=[
         ["./elephants.png", "an elephant; an elephant walking in the river; four elephants walking in the river"],
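For context: with arbitrary input sizes the final Swin feature grid is no longer a fixed shape, so the flattened map has to be reshaped with the H and W returned at runtime before upsampling. Below is a minimal, self-contained sketch of that reshape-and-upsample step; the sizes and tensors are illustrative stand-ins, not values taken from app.py.

import torch
import torch.nn as nn

# Illustrative: a 480x640 input with an overall stride of 32 gives a 15x20 feature grid,
# rather than the 7x7 grid a fixed 224x224 input would produce.
H, W = 15, 20
feat_map = torch.rand(1, 512, H * W)        # B, C, L -- flattened feature map (stand-in)
text_emb = torch.rand(1, 512)               # embedding of the top-matched text (stand-in)

heat = (feat_map * text_emb.unsqueeze(-1)).sum(1).softmax(-1)   # B, L
heat = heat.view(1, 1, H, W)                                    # reshape with the runtime grid size
heat = nn.Upsample(size=(480, 640), mode='bilinear')(heat)      # back to image resolution
print(heat.shape)                                               # torch.Size([1, 1, 480, 640])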
model/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/model/__pycache__/__init__.cpython-39.pyc and b/model/__pycache__/__init__.cpython-39.pyc differ

model/__pycache__/model.cpython-39.pyc CHANGED
Binary files a/model/__pycache__/model.cpython-39.pyc and b/model/__pycache__/model.cpython-39.pyc differ

model/__pycache__/templates.cpython-39.pyc CHANGED
Binary files a/model/__pycache__/templates.cpython-39.pyc and b/model/__pycache__/templates.cpython-39.pyc differ

model/image_encoder/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/model/image_encoder/__pycache__/__init__.cpython-39.pyc and b/model/image_encoder/__pycache__/__init__.cpython-39.pyc differ

model/image_encoder/__pycache__/build.cpython-39.pyc CHANGED
Binary files a/model/image_encoder/__pycache__/build.cpython-39.pyc and b/model/image_encoder/__pycache__/build.cpython-39.pyc differ

model/image_encoder/__pycache__/focalnet.cpython-39.pyc CHANGED
Binary files a/model/image_encoder/__pycache__/focalnet.cpython-39.pyc and b/model/image_encoder/__pycache__/focalnet.cpython-39.pyc differ

model/image_encoder/__pycache__/swin_transformer.cpython-39.pyc CHANGED
Binary files a/model/image_encoder/__pycache__/swin_transformer.cpython-39.pyc and b/model/image_encoder/__pycache__/swin_transformer.cpython-39.pyc differ
model/image_encoder/swin_transformer.py CHANGED

@@ -4,9 +4,10 @@
 # Licensed under The MIT License [see LICENSE for details]
 # Written by Ze Liu
 # --------------------------------------------------------
-
+import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
 from timm.models.layers import DropPath, to_2tuple, trunc_normal_

@@ -230,38 +231,51 @@ class SwinTransformerBlock(nn.Module):

         self.register_buffer("attn_mask", attn_mask)

-    def forward(self, x):
-        H, W = self.input_resolution
+    def forward(self, x, Ph, Pw, attn_mask):
+        # H, W = self.input_resolution
         B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
+        # assert L == H * W, "input feature has wrong size"

         shortcut = x
         x = self.norm1(x)
-        x = x.view(B, H, W, C)
+        x = x.view(B, Ph, Pw, C)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - Pw % self.window_size) % self.window_size
+        pad_b = (self.window_size - Ph % self.window_size) % self.window_size
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape

         # cyclic shift
         if self.shift_size > 0:
             shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = attn_mask
         else:
             shifted_x = x
+            attn_mask = None

         # partition windows
         x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
         x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

         # W-MSA/SW-MSA
-        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

         # merge windows
         attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

         # reverse cyclic shift
         if self.shift_size > 0:
             x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
         else:
             x = shifted_x
-        x = x.view(B, H * W, C)
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :Ph, :Pw, :].contiguous()
+
+        x = x.view(B, Ph * Pw, C)

         # FFN
         x = shortcut + self.drop_path(x)
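The key change above is that the block no longer assumes the patch grid divides evenly by the window size: it pads the grid up to the next multiple, attends, then crops the padding off. A standalone sketch of that arithmetic, with illustrative sizes (window_size=7, a 15x20 grid) rather than values taken from the model:

import torch
import torch.nn.functional as F

window_size = 7
B, C = 1, 96
Ph, Pw = 15, 20                                           # arbitrary patch grid (illustrative)

x = torch.rand(B, Ph, Pw, C)
pad_r = (window_size - Pw % window_size) % window_size    # 1 -> width padded 20 -> 21
pad_b = (window_size - Ph % window_size) % window_size    # 6 -> height padded 15 -> 21
x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))                  # F.pad pads trailing dims first: C, then W, then H
_, Hp, Wp, _ = x.shape
print(Hp, Wp)                                             # 21 21 -- both divisible by window_size

# ... window partition / attention / window reverse would run on the padded Hp x Wp grid ...

x = x[:, :Ph, :Pw, :].contiguous()                        # crop the padding off again
print(x.shape)                                            # torch.Size([1, 15, 20, 96])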
@@ -304,16 +318,20 @@ class PatchMerging(nn.Module):
         self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
         self.norm = norm_layer(4 * dim)

-    def forward(self, x):
+    def forward(self, x, Ph, Pw):
         """
         x: B, H*W, C
         """
-        H, W = self.input_resolution
         B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
+        # assert L == H * W, "input feature has wrong size"
+        # assert Ph % 2 == 0 and Pw % 2 == 0, f"x size ({Ph}*{Pw}) are not even."

-        x = x.view(B, H, W, C)
+        x = x.view(B, Ph, Pw, C)
+
+        # padding
+        pad_input = (Ph % 2 == 1) or (Pw % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, Pw % 2, 0, Ph % 2))

         x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
         x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
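Similarly, PatchMerging above replaces the even-size assertion with a one-row/one-column pad so the 2x2 merge also works on odd grids. A tiny sketch with illustrative sizes:

import torch
import torch.nn.functional as F

B, C = 1, 96
Ph, Pw = 15, 20                                  # odd height, even width (illustrative)
x = torch.rand(B, Ph, Pw, C)

if (Ph % 2 == 1) or (Pw % 2 == 1):
    x = F.pad(x, (0, 0, 0, Pw % 2, 0, Ph % 2))   # pad width by Pw % 2, height by Ph % 2

x0 = x[:, 0::2, 0::2, :]                         # one of the four 2x2 sub-grids
print(x.shape, x0.shape)                         # torch.Size([1, 16, 20, 96]) torch.Size([1, 8, 10, 96])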
@@ -366,6 +384,8 @@ class BasicLayer(nn.Module):
         self.input_resolution = input_resolution
         self.depth = depth
         self.use_checkpoint = use_checkpoint
+        self.window_size = window_size
+        self.shift_size = window_size // 2

         # build blocks
         self.blocks = nn.ModuleList([
@@ -385,15 +405,39 @@
         else:
             self.downsample = None

-    def forward(self, x):
+    def forward(self, x, Ph, Pw):
+
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(Ph / self.window_size)) * self.window_size
+        Wp = int(np.ceil(Pw / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+
+        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+
         for blk in self.blocks:
             if self.use_checkpoint:
                 x = checkpoint.checkpoint(blk, x)
             else:
-                x = blk(x)
+                x = blk(x, Ph, Pw, attn_mask)
         if self.downsample is not None:
-            x = self.downsample(x)
-        return x
+            x = self.downsample(x, Ph, Pw)
+            Ph, Pw = (Ph + 1) // 2, (Pw + 1) // 2
+        return x, Ph, Pw

     def extra_repr(self) -> str:
         return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
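Because the padded grid size now depends on the input, the shifted-window attention mask is rebuilt inside BasicLayer.forward instead of being precomputed for a fixed resolution. The sketch below reproduces that mask construction in isolation; window_partition is inlined here and the sizes are illustrative:

import numpy as np
import torch

window_size, shift_size = 7, 3
Ph, Pw = 15, 20                                    # illustrative patch grid

Hp = int(np.ceil(Ph / window_size)) * window_size  # 21
Wp = int(np.ceil(Pw / window_size)) * window_size  # 21

# label each region of the shifted grid with an integer id
img_mask = torch.zeros((1, Hp, Wp, 1))
slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
cnt = 0
for h in slices:
    for w in slices:
        img_mask[:, h, w, :] = cnt
        cnt += 1

# inlined window_partition: (1, Hp, Wp, 1) -> (nW, window_size*window_size)
mask = img_mask.view(1, Hp // window_size, window_size, Wp // window_size, window_size, 1)
mask_windows = mask.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size * window_size)

# tokens coming from different regions must not attend to each other
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
print(attn_mask.shape)                             # torch.Size([9, 49, 49]) -- nW x tokens x tokens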
@@ -440,12 +484,14 @@ class PatchEmbed(nn.Module):
     def forward(self, x):
         B, C, H, W = x.shape
         # FIXME look at relaxing size constraints
-        assert H == self.img_size[0] and W == self.img_size[1], \
-            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
-        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
+        # assert H == self.img_size[0] and W == self.img_size[1], \
+        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x)
+        Ph, Pw = x.shape[2:]
+        x = x.flatten(2).transpose(1, 2)  # B Ph*Pw C
         if self.norm is not None:
             x = self.norm(x)
-        return x
+        return x, Ph, Pw

     def flops(self):
         Ho, Wo = self.patches_resolution
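With the size assertion commented out, the patch grid is simply read off the projected tensor. A minimal sketch; the conv below is an illustrative stand-in for self.proj (a Swin-style 4x4, stride-4 patch embedding), not the model's actual layer:

import torch
import torch.nn as nn

proj = nn.Conv2d(3, 96, kernel_size=4, stride=4)   # stand-in for PatchEmbed.proj

x = torch.rand(1, 3, 480, 644)                     # arbitrary input size, no 224x224 assert
x = proj(x)
Ph, Pw = x.shape[2:]                               # runtime patch grid: 120, 161
x = x.flatten(2).transpose(1, 2)                   # B, Ph*Pw, C
print(Ph, Pw, x.shape)                             # 120 161 torch.Size([1, 19320, 96])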
@@ -558,20 +604,20 @@ class SwinTransformer(nn.Module):
         return {'relative_position_bias_table'}

     def forward_features(self, x, output_map=False):
-        x = self.patch_embed(x)
+        x, Ph, Pw = self.patch_embed(x)
         if self.ape:
             x = x + self.absolute_pos_embed
         x = self.pos_drop(x)

         for layer in self.layers:
-            x = layer(x)
+            x, Ph, Pw = layer(x, Ph, Pw)

         x_map = self.norm(x).transpose(1, 2)  # B C L
         x = self.avgpool(x_map)  # B C 1
         x = torch.flatten(x, 1)

         if output_map:
-            return x, x_map
+            return x, x_map, Ph, Pw
         else:
             return x
model/model.py CHANGED

@@ -156,7 +156,7 @@ class UniCLModel(nn.Module):
     def encode_image(self, image, norm=True, output_map=False):
         x = self.image_encoder.forward_features(image, output_map=output_map)
         if output_map:
-            x, x_map = x
+            x, x_map, H, W = x

         x = x @ self.image_projection

@@ -169,7 +169,7 @@ class UniCLModel(nn.Module):
         x_map = x_map / x_map.norm(dim=1, keepdim=True)

         if output_map:
-            return x, x_map
+            return x, x_map, H, W
         else:
             return x
model/text_encoder/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/model/text_encoder/__pycache__/__init__.cpython-39.pyc and b/model/text_encoder/__pycache__/__init__.cpython-39.pyc differ

model/text_encoder/__pycache__/build.cpython-39.pyc CHANGED
Binary files a/model/text_encoder/__pycache__/build.cpython-39.pyc and b/model/text_encoder/__pycache__/build.cpython-39.pyc differ

model/text_encoder/__pycache__/hf_model.cpython-39.pyc CHANGED
Binary files a/model/text_encoder/__pycache__/hf_model.cpython-39.pyc and b/model/text_encoder/__pycache__/hf_model.cpython-39.pyc differ

model/text_encoder/__pycache__/registry.cpython-39.pyc CHANGED
Binary files a/model/text_encoder/__pycache__/registry.cpython-39.pyc and b/model/text_encoder/__pycache__/registry.cpython-39.pyc differ

model/text_encoder/__pycache__/transformer.cpython-39.pyc CHANGED
Binary files a/model/text_encoder/__pycache__/transformer.cpython-39.pyc and b/model/text_encoder/__pycache__/transformer.cpython-39.pyc differ