import math

import torch
import torch.nn as nn
import torch.nn.init as init
from transformers import XCLIPVisionConfig, XCLIPVisionModel

from .clip import clip
from .mamba_base import MambaConfig, ResidualBlock
def create_reorder_index(N, device):
    """Build a column-major "snake" scan over an N x N patch grid.

    Even columns are traversed top-to-bottom, odd columns bottom-to-top,
    e.g. for N=2 the flat order is [0, 2, 3, 1].
    """
    new_order = []
    for col in range(N):
        if col % 2 == 0:
            # Even column: walk down (row-major indices col, col+N, ...).
            new_order.extend(range(col, N * N, N))
        else:
            # Odd column: walk up, starting from the bottom cell of the column.
            new_order.extend(range(col + N * (N - 1), col - 1, -N))
    return torch.tensor(new_order, device=device)
def reorder_data(data, N):
    """Reorder the patch axis of a (B, t, N*N, c) tensor along the snake scan."""
    assert isinstance(data, torch.Tensor), "data should be a torch.Tensor"
    device = data.device
    new_order = create_reorder_index(N, device)
    B, t, _, _ = data.shape
    # Broadcast the flat index over batch and time, then gather along the patch axis.
    index = new_order.repeat(B, t, 1).unsqueeze(-1)
    reordered_data = torch.gather(data, 2, index.expand_as(data))
    return reordered_data
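
# Quick sanity check (a sketch, not part of the model): for N=2 the snake scan
# visits the 2x2 grid down the first column and back up the second, so a patch
# axis holding [0, 1, 2, 3] comes back as [0, 2, 3, 1]:
#
#   >>> create_reorder_index(2, torch.device("cpu")).tolist()
#   [0, 2, 3, 1]
#   >>> patches = torch.arange(4.0).view(1, 1, 4, 1)  # (B=1, t=1, N*N=4, c=1)
#   >>> reorder_data(patches, 2).flatten().tolist()
#   [0.0, 2.0, 3.0, 1.0]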
class XCLIP_DeMamba(nn.Module):
    def __init__(self, channel_size=768, class_num=1):
        super(XCLIP_DeMamba, self).__init__()
        # self.encoder = XCLIPVisionModel.from_pretrained("GenVideo/pretrained_weights/xclip")
        # Instantiate from a fresh config to train the backbone from scratch
        # instead of loading pretrained weights.
        config = XCLIPVisionConfig()
        self.encoder = XCLIPVisionModel(config)
        channel = 768
        self.fusing_ratios = 1
        self.patch_nums = (14 // self.fusing_ratios) ** 2  # unused with the hard-coded fc1 below
        self.mamba_configs = MambaConfig(d_model=channel)
        self.mamba = ResidualBlock(config=self.mamba_configs)
        # self.fc1 = nn.Linear((self.patch_nums + 1) * channel, class_num)
        self.fc1 = nn.Linear(38400, class_num)  # 7*7 patches * 768 + 768 global = 38400
        # self.fc_norm = nn.LayerNorm(self.patch_nums * channel)
        self.fc_norm = None  # built lazily in forward() once the feature width is known
        self.fc_norm2 = nn.LayerNorm(768)
        self.initialize_weights(self.fc1)
        self.dropout = nn.Dropout(p=0.0)
    def initialize_weights(self, module):
        """Xavier init for linear layers, Kaiming for convs, identity for batch norms."""
        for m in module.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.Conv2d):
                init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
    def forward(self, x):
        b, t, _, h, w = x.shape
        images = x.view(b * t, 3, h, w)  # fold frames into the batch axis
        outputs = self.encoder(images, output_hidden_states=True)
        sequence_output = outputs['last_hidden_state'][:, 1:, :]  # drop the CLS token
        _, _, c = sequence_output.shape
        # Global branch: mean-pool the per-frame pooled embeddings over time.
        global_feat = outputs['pooler_output'].reshape(b, t, -1)
        global_feat = global_feat.mean(1)
        global_feat = self.fc_norm2(global_feat)
        # Local branch: split the patch grid into (fusing_ratios x fusing_ratios) blocks.
        sequence_output = sequence_output.view(b, t, -1, c)
        _, _, f_w, _ = sequence_output.shape
        f_h, f_w = int(math.sqrt(f_w)), int(math.sqrt(f_w))
        s = f_h // self.fusing_ratios
        sequence_output = sequence_output.view(b, t, self.fusing_ratios, s, self.fusing_ratios, s, c)
        # Fold the spatial blocks into the batch axis so each block forms a sequence.
        x = sequence_output.permute(0, 2, 4, 1, 3, 5, 6).contiguous().view(b * s * s, t, -1, c)
        b_l = b * s * s
        x = reorder_data(x, self.fusing_ratios)  # snake-scan the patches inside each block
        x = x.permute(0, 2, 1, 3).contiguous().view(b_l, -1, c)
        res = self.mamba(x)  # scan each patch/time sequence with the Mamba block
        video_level_features = res.mean(1)
        video_level_features = video_level_features.view(b, -1)
        # Build the LayerNorm lazily, now that the flattened feature width is known.
        if self.fc_norm is None:
            self.fc_norm = nn.LayerNorm(video_level_features.size(-1)).to(video_level_features.device)
        video_level_features = self.fc_norm(video_level_features)
        video_level_features = torch.cat((global_feat, video_level_features), dim=1)
        pred = self.fc1(video_level_features)
        pred = self.dropout(pred)  # p=0.0, so this is a no-op kept for completeness
        return pred
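
# Usage sketch (an assumption, not repo-verified: it relies on the default
# XCLIPVisionConfig of 224px input with patch size 32, i.e. a 7x7 patch grid,
# which is what the hard-coded fc1 width of 38400 expects, and on .mamba_base
# being importable):
#
#   model = XCLIP_DeMamba()
#   frames = torch.randn(2, 8, 3, 224, 224)  # (batch, time, channels, H, W)
#   logits = model(frames)                   # -> shape (2, 1)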
class CLIP_DeMamba(nn.Module):
    def __init__(self, channel_size=512, class_num=1):
        super(CLIP_DeMamba, self).__init__()
        self.clip_model, preprocess = clip.load('ViT-B-14')
        self.clip_model = self.clip_model.float()
        channel = 512
        self.fusing_ratios = 2
        self.patch_nums = (14 // self.fusing_ratios) ** 2  # 7x7 = 49 blocks on a 14x14 grid
        self.mamba_configs = MambaConfig(d_model=channel)
        self.mamba = ResidualBlock(config=self.mamba_configs)
        self.fc1 = nn.Linear(channel * (self.patch_nums + 1), class_num)  # 49 local + 1 global
        self.bn1 = nn.BatchNorm1d(channel)
        self.initialize_weights(self.fc1)
    def initialize_weights(self, module):
        """Xavier init for linear layers, Kaiming for convs, identity for batch norms."""
        for m in module.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.Conv2d):
                init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
    def forward(self, x):
        b, t, _, h, w = x.shape
        images = x.view(b * t, 3, h, w)  # fold frames into the batch axis
        # The bundled clip is expected to return per-patch tokens, not one pooled vector.
        sequence_output = self.clip_model.encode_image(images)
        _, _, c = sequence_output.shape
        sequence_output = sequence_output.view(b, t, -1, c)
        # Global branch: mean over all frames and patches.
        global_feat = sequence_output.reshape(b, -1, c)
        global_feat = global_feat.mean(1)
        # Local branch: split the patch grid into (fusing_ratios x fusing_ratios) blocks.
        _, _, f_w, _ = sequence_output.shape
        f_h, f_w = int(math.sqrt(f_w)), int(math.sqrt(f_w))
        s = f_h // self.fusing_ratios
        sequence_output = sequence_output.view(b, t, self.fusing_ratios, s, self.fusing_ratios, s, c)
        x = sequence_output.permute(0, 2, 4, 1, 3, 5, 6).contiguous().view(b * s * s, t, -1, c)
        b_l = b * s * s
        x = reorder_data(x, self.fusing_ratios)  # snake-scan the patches inside each block
        x = x.permute(0, 2, 1, 3).contiguous().view(b_l, -1, c)
        res = self.mamba(x)  # scan each patch/time sequence with the Mamba block
        video_level_features = res.mean(1)
        video_level_features = video_level_features.view(b, -1)
        video_level_features = torch.cat((global_feat, video_level_features), dim=1)
        x = self.fc1(video_level_features)
        return x
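
# Usage sketch (an assumption, not repo-verified: it relies on the bundled .clip
# exposing a 'ViT-B-14' checkpoint whose encode_image returns per-patch tokens
# of shape (b*t, 196, 512) -- the 14x14-grid arithmetic above and fc1's width of
# 512 * (49 + 1) both assume exactly that):
#
#   model = CLIP_DeMamba()
#   frames = torch.randn(2, 8, 3, 224, 224)  # (batch, time, channels, H, W)
#   logits = model(frames)                   # -> shape (2, 1)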
if __name__ == '__main__':
    model = CLIP_DeMamba()
    print(model)