"""
MINTIME: Multi-Identity size-iNvariant TIMEsformer for Video Deepfake Detection (TIFS 2024)
Copyright (c) ISTI-CNR and its affiliates.
Modified by Davide Alessandro Coccomini from https://github.com/davide-coccomini/MINTIME-Multi-Identity-size-iNvariant-TIMEsformer-for-Video-Deepfake-Detection
"""
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat
from statistics import mean
from torch.nn.init import trunc_normal_
import cv2
import numpy as np
from random import random
from .clip import clip
from einops.layers.torch import Rearrange
# helpers
def exists(val):
return val is not None
# classes
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.fn = fn
self.norm = nn.LayerNorm(dim)
def forward(self, x, *args, **kwargs):
x = self.norm(x)
return self.fn(x, *args, **kwargs)
# time token shift
def shift(t, amt):
if amt == 0:
return t
return F.pad(t, (0, 0, 0, 0, amt, -amt))
class PreTokenShift(nn.Module):
def __init__(self, frames, fn):
super().__init__()
self.frames = frames
self.fn = fn
def forward(self, x, *args, **kwargs):
f, dim = self.frames, x.shape[-1]
cls_x, x = x[:, :1], x[:, 1:]
x = rearrange(x, 'b (f n) d -> b f n d', f = f)
# shift along time frame before and after
dim_chunk = (dim // 3)
chunks = x.split(dim_chunk, dim = -1)
chunks_to_shift, rest = chunks[:3], chunks[3:]
shifted_chunks = tuple(map(lambda args: shift(*args), zip(chunks_to_shift, (-1, 0, 1))))
x = torch.cat((*shifted_chunks, *rest), dim = -1)
x = rearrange(x, 'b f n d -> b (f n) d')
x = torch.cat((cls_x, x), dim = 1)
return self.fn(x, *args, **kwargs)
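# Illustrative note (added commentary, not from the original source): shift()
# pads/crops along the frame axis, so for a toy tensor laid out as
# (batch, frames, patches, dim),
#
#     t = torch.arange(4.).view(1, 4, 1, 1)   # hypothetical example
#     shift(t, 1)    # frame i now holds frame i-1's tokens (frame 0 is zeroed)
#     shift(t, -1)   # frame i now holds frame i+1's tokens (last frame is zeroed)
#
# PreTokenShift splits the channels into thirds and shifts the first three chunks
# by -1, 0 and +1 frames, mixing neighbouring frames before the wrapped
# attention / feed-forward block runs.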
# feedforward
class GEGLU(nn.Module):
def forward(self, x):
x, gates = x.chunk(2, dim = -1)
return x * F.gelu(gates)
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
self.net = nn.Sequential(
nn.Linear(dim, dim * mult * 2),
GEGLU(),
nn.Dropout(dropout),
nn.Linear(dim * mult, dim)
)
def forward(self, x):
return self.net(x)
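# Added commentary: the first Linear projects to dim * mult * 2 channels so that
# GEGLU can split them into a value half and a gate half (value * GELU(gate)),
# leaving dim * mult channels for the second Linear to project back to dim.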
# attention
def attn(q, k, v):
sim = torch.einsum('b i d, b j d -> b i j', q, k)
attn = sim.softmax(dim = -1)
out = torch.einsum('b i j, b j d -> b i d', attn, v)
return out, attn
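# Added commentary: attn() returns both the attended values and the softmaxed
# attention matrix (shape: batch*heads x queries x keys); Attention.forward uses
# the latter to expose the cls-token attention maps.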
class Attention(nn.Module):
def __init__(self, dim, dim_head = 64, heads = 8, dropout = 0.):
super().__init__()
self.heads = heads
self.scale = dim_head ** -0.5
inner_dim = dim_head * heads
self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
self.to_out = nn.Sequential(
nn.Linear(inner_dim, dim),
nn.Dropout(dropout)
)
def forward(self, x, einops_from, einops_to, **einops_dims):
h = self.heads
q, k, v = self.to_qkv(x).chunk(3, dim = -1)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), (q, k, v))
q = q * self.scale
        # split off the classification token (the first token)
(cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, :1], t[:, 1:]), (q, k, v))
# let classification token attend to key / values of all patches across time and space
cls_out, cls_attentions = attn(cls_q, k, v)
# rearrange across time or space
q_, k_, v_ = map(lambda t: rearrange(t, f'{einops_from} -> {einops_to}', **einops_dims), (q_, k_, v_))
# expand cls token keys and values across time or space and concat
r = q_.shape[0] // cls_k.shape[0]
cls_k, cls_v = map(lambda t: repeat(t, 'b () d -> (b r) () d', r = r), (cls_k, cls_v))
k_ = torch.cat((cls_k, k_), dim = 1)
v_ = torch.cat((cls_v, v_), dim = 1)
# attention
out, attentions = attn(q_, k_, v_)
# merge back time or space
out = rearrange(out, f'{einops_to} -> {einops_from}', **einops_dims)
# concat back the cls token
out = torch.cat((cls_out, out), dim = 1)
# merge back the heads
out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
# combine heads out
return self.to_out(out), cls_attentions
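# Added commentary on how Attention is driven (patterns taken from
# SizeInvariantTimeSformer.forward below):
#
#   time attention:  einops_from='b (f n) d', einops_to='(b n) f d', n=num_patches
#                    -> every spatial position attends across the f frames
#   space attention: einops_from='b (f n) d', einops_to='(b f) n d', f=num_frames
#                    -> every frame attends across its n patches
#
# The cls token is split off first, attends over all tokens, and its keys/values
# are re-attached to every time/space group.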
class SizeInvariantTimeSformer(nn.Module):
def __init__(
self,
*,
require_attention = False
):
super().__init__()
self.dim = 512
self.num_frames = 8
self.max_identities = 1
self.image_size = 224
self.num_classes = 1
self.patch_size = 1
self.num_patches = 196
self.channels = 512
self.depth = 9
self.heads = 8
self.dim_head = 64
self.attn_dropout = 0.
self.ff_dropout = 0.
self.shift_tokens = False
self.enable_size_emb = True
self.enable_pos_emb = True
self.require_attention = require_attention
num_positions = self.num_frames * self.channels
self.to_patch_embedding = nn.Linear(self.channels, self.dim)
self.cls_token = nn.Parameter(torch.randn(1, self.dim))
self.pos_emb = nn.Embedding(num_positions + 1, self.dim)
if self.enable_size_emb:
self.size_emb = nn.Embedding(num_positions + 1, self.dim)
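        # Added commentary: both embedding tables are sized num_frames * channels + 1
        # (= 4097 rows), although forward() only indexes num_frames * num_patches + 1
        # (= 1569) positions. Note also that size_emb is instantiated here but is
        # never added to the tokens in forward(), and enable_pos_emb is not checked.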
self.layers = nn.ModuleList([])
for _ in range(self.depth):
ff = FeedForward(self.dim, dropout = self.ff_dropout)
time_attn = Attention(self.dim, dim_head = self.dim_head, heads = self.heads, dropout = self.attn_dropout)
spatial_attn = Attention(self.dim, dim_head = self.dim_head, heads = self.heads, dropout = self.attn_dropout)
if self.shift_tokens:
time_attn, spatial_attn, ff = map(lambda t: PreTokenShift(self.num_frames, t), (time_attn, spatial_attn, ff))
time_attn, spatial_attn, ff = map(lambda t: PreNorm(self.dim, t), (time_attn, spatial_attn, ff))
self.layers.append(nn.ModuleList([time_attn, spatial_attn, ff]))
self.to_out = nn.Sequential(
nn.LayerNorm(self.dim),
nn.Linear(self.dim, self.num_classes)
)
# Initialization
trunc_normal_(self.pos_emb.weight, std=.02)
trunc_normal_(self.cls_token, std=.02)
if self.enable_size_emb:
trunc_normal_(self.size_emb.weight, std=.02)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_emb', 'cls_token'}
def forward(self, x):
b, f, c, h, w = x.shape
n = h * w
device = x.device
        x = rearrange(x, 'b f c h w -> b (f h w) c')   # B x F*H*W x C
        tokens = self.to_patch_embedding(x)            # B x 8*14*14 x dim
# Add cls token
cls_token = repeat(self.cls_token, 'n d -> b n d', b = b)
x = torch.cat((cls_token, tokens), dim = 1)
# Positional embedding
x += self.pos_emb(torch.arange(x.shape[1], device=device))
# Time and space attention
for (time_attn, spatial_attn, ff) in self.layers:
y, _ = time_attn(x, 'b (f n) d', '(b n) f d', n = n)
x = x + y
y, _ = spatial_attn(x, 'b (f n) d', '(b f) n d', f = f)
x = x + y
x = ff(x) + x
        cls_token = x[:, 0]
        # Attention maps are not collected in this variant, so the
        # require_attention flag has no effect: both branches return only
        # the classification logits.
        if self.require_attention:
            return self.to_out(cls_token)
        else:
            return self.to_out(cls_token)
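# Usage sketch (illustrative; the shapes follow ViT_B_MINTIME below): the
# transformer consumes pre-extracted feature maps rather than raw frames.
#
#     feats = torch.randn(2, 8, 512, 14, 14)        # B x F x C x H x W (toy input)
#     logits = SizeInvariantTimeSformer()(feats)    # -> shape (2, 1)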
class ViT_B_MINTIME(nn.Module):
def __init__(
self, channel_size=512, class_num=1
):
        super(ViT_B_MINTIME, self).__init__()
        # Note: the channel_size and class_num arguments are currently unused;
        # the head below hard-codes dim = 512 and num_classes = 1.
        self.clip_model, preprocess = clip.load('ViT-B-16')
        self.clip_model = self.clip_model.float()
        self.head = SizeInvariantTimeSformer()
def forward(self, x):
b, t, _, h, w = x.shape
        images = x.view(b * t, 3, h, w)
        # The bundled clip module is expected to return per-patch token features
        # of shape (B*T, 196, C), which are reshaped into 14x14 feature maps.
        sequence_output = self.clip_model.encode_image(images)
        _, _, c = sequence_output.shape
        sequence_output = sequence_output.view(b, t, 14, 14, c)     # B x T x 14 x 14 x C
        sequence_output = sequence_output.permute(0, 1, 4, 2, 3)    # B x T x C x 14 x 14
res = self.head(sequence_output)
return res
if __name__ == '__main__':
    # Quick smoke test: a batch of 4 clips, 8 frames each, at 224x224 resolution.
    model = ViT_B_MINTIME()
    model = model.cuda()
    dummy_input = torch.randn(4, 8, 3, 224, 224)
    dummy_input = dummy_input.cuda()
    print(model(dummy_input))
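    # A CPU-friendly variant of the same smoke test (an illustrative sketch,
    # assuming the bundled clip.load('ViT-B-16') weights can run on CPU):
    #
    #     model = ViT_B_MINTIME().eval()
    #     with torch.no_grad():
    #         logits = model(torch.randn(1, 8, 3, 224, 224))   # -> shape (1, 1)
    #         score = torch.sigmoid(logits)   # assuming the single logit is a
    #                                         # fake/real score trained with BCE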