Spaces: Runtime error
use mg-llava instead of llava in AutoConfig.register

ml_mgie/mgie_llava.py  CHANGED  (+91 -47)
@@ -12,12 +12,12 @@ import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
 
 from transformers import AutoConfig, AutoModelForCausalLM, \
+    LlamaConfig, LlamaModel, LlamaForCausalLM, \
+    CLIPVisionModel, CLIPImageProcessor
 
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 
+import os
 
 DEFAULT_IMAGE_TOKEN = "<image>"
 DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
@@ -26,7 +26,7 @@ DEFAULT_IM_END_TOKEN = "<im_end>"
 
 
 class LlavaConfig(LlamaConfig):
-    model_type = "llava"
+    model_type = "mg-llava"
 
 
 class LlavaLlamaModel(LlamaModel):
@@ -37,11 +37,13 @@ class LlavaLlamaModel(LlamaModel):
 
         if hasattr(config, "mm_vision_tower"):
             # HACK: for FSDP
+            self.vision_tower = [
+                CLIPVisionModel.from_pretrained(config.mm_vision_tower)]
             # self.vision_tower = CLIPVisionModel.from_pretrained(config.mm_vision_tower)
 
         if hasattr(config, "use_mm_proj"):
+            self.mm_projector = nn.Linear(
+                config.mm_hidden_size, config.hidden_size)
 
     def get_vision_tower(self):
         vision_tower = getattr(self, 'vision_tower', None)
@@ -67,18 +69,22 @@ class LlavaLlamaModel(LlamaModel):
         self.vision_tower = vision_tower
 
         vision_config = vision_tower.config
+        num_patches = (vision_config.image_size //
+                       vision_config.patch_size) ** 2
 
         self.config.use_mm_proj = True
        self.config.mm_hidden_size = vision_config.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
 
         if not hasattr(self, 'mm_projector'):
+            self.mm_projector = nn.Linear(
+                vision_config.hidden_size, self.config.hidden_size)
 
         if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(
+                pretrain_mm_mlp_adapter, map_location='cpu')
+            self.mm_projector.load_state_dict(
+                {k.split('.')[-1]: v for k, v in mm_projector_weights.items()})
 
         return dict(
             image_processor=image_processor,
@@ -117,21 +123,28 @@ class LlavaLlamaModel(LlamaModel):
                 # variable length images
                 image_features = []
                 for image in images:
+                    image_forward_out = vision_tower(
+                        image.unsqueeze(0), output_hidden_states=True)
+                    select_hidden_state_layer = getattr(
+                        self.config, "mm_vision_select_layer", -1)
                     select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
                     image_feature = select_hidden_state[:, 1:]
                     image_features.append(image_feature)
             else:
+                image_forward_outs = vision_tower(
+                    images.to(vision_tower.dtype), output_hidden_states=True)
+                select_hidden_state_layer = getattr(
+                    self.config, "mm_vision_select_layer", -1)
                 select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
+                image_features = select_hidden_state[:, 1:].to(
+                    images.dtype)
             if type(images) is list:
+                image_features = [self.mm_projector(
+                    image_feature)[0] for image_feature in image_features]
             else:
                 image_features = self.mm_projector(image_features)
+            dummy_image_features = torch.zeros(
+                256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
             dummy_image_features = self.mm_projector(dummy_image_features)
 
             new_input_embeds = []
@@ -139,7 +152,8 @@ class LlavaLlamaModel(LlamaModel):
             for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
                 if (cur_input_ids == vision_tower.config.im_patch_token).sum() == 0:
                     # multimodal LLM, but the current sample is not multimodal
+                    cur_input_embeds = cur_input_embeds + \
+                        (0. * dummy_image_features).sum()
                     new_input_embeds.append(cur_input_embeds)
                     cur_image_idx += 1
                     continue
@@ -147,32 +161,43 @@ class LlavaLlamaModel(LlamaModel):
                     cur_image_features = image_features[cur_image_idx]
                     num_patches = cur_image_features.shape[0]
                     if (cur_input_ids == vision_tower.config.im_start_token).sum() != (cur_input_ids == vision_tower.config.im_end_token).sum():
+                        raise ValueError(
+                            "The number of image start tokens and image end tokens should be the same.")
+                    image_start_tokens = torch.where(
+                        cur_input_ids == vision_tower.config.im_start_token)[0]
                     for image_start_token_pos in image_start_tokens:
+                        cur_image_features = image_features[cur_image_idx].to(
+                            device=cur_input_embeds.device)
                         num_patches = cur_image_features.shape[0]
                         if cur_input_ids[image_start_token_pos + num_patches + 1] != vision_tower.config.im_end_token:
+                            raise ValueError(
+                                "The image end token should follow the image start token.")
                        if orig_embeds_params is not None:
+                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features,
+                                                              cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
                        else:
+                            cur_new_input_embeds = torch.cat(
+                                (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
                         cur_image_idx += 1
                     new_input_embeds.append(cur_new_input_embeds)
                 else:
                     cur_image_features = image_features[cur_image_idx]
                     num_patches = cur_image_features.shape[0]
                     if (cur_input_ids == vision_tower.config.im_patch_token).sum() != num_patches:
+                        raise ValueError(
+                            "The number of image patch tokens should be the same as the number of image patches.")
+                    masked_indices = torch.where(
+                        cur_input_ids == vision_tower.config.im_patch_token)[0]
                     mask_index_start = masked_indices[0]
                     if (masked_indices != torch.arange(mask_index_start, mask_index_start+num_patches, device=masked_indices.device, dtype=masked_indices.dtype)).any():
+                        raise ValueError(
+                            "The image patch tokens should be consecutive.")
                     if orig_embeds_params is not None:
+                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start].detach(
+                        ), cur_image_features, cur_input_embeds[mask_index_start+num_patches:].detach()), dim=0)
                     else:
+                        cur_new_input_embeds = torch.cat(
+                            (cur_input_embeds[:mask_index_start], cur_image_features, cur_input_embeds[mask_index_start+num_patches:]), dim=0)
                     new_input_embeds.append(cur_new_input_embeds)
                     cur_image_idx += 1
             inputs_embeds = torch.stack(new_input_embeds, dim=0)
@@ -184,6 +209,7 @@ class LlavaLlamaModel(LlamaModel):
             return_dict=return_dict
         )
 
+
 class EditMapper(nn.Module):
     def __init__(self):
         super().__init__()
@@ -202,6 +228,7 @@ class EditMapper(nn.Module):
 
         return feat
 
+
 class LlavaLlamaForCausalLM(LlamaForCausalLM):
     config_class = LlavaConfig
 
@@ -209,7 +236,8 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
         super(LlamaForCausalLM, self).__init__(config)
         self.model = LlavaLlamaModel(config)
 
+        self.lm_head = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
 
         self.edit_head = EditMapper()
 
@@ -292,12 +320,15 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
         if labels is not None:
             llm = []
             for i in range(labels.shape[0]):
+                try:
+                    p = labels[i].data.cpu().tolist().index(32003)-1
+                except:
+                    p = len(labels[i])-9
                 p = min(len(hidden_states[i])-9, p)
                 llm.append(hidden_states[i][p:p+8].unsqueeze(0))
             llm = torch.cat(llm, dim=0)
+            hid_edit = self.edit_head(
+                llm, self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
 
             B, DROP = labels.shape[0], 0.05
 
@@ -305,24 +336,30 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
                 self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
 
             with torch.no_grad():
+                lat_ans, lat_inp = self.vae.encode(p2p_ans).latent_dist.sample(
+                )*self.vae.config.scaling_factor, self.vae.encode(p2p_inp).latent_dist.mode()
                 lat_ans, lat_inp = [torch.from_numpy(lat_ans.data.cpu().float().numpy()).to(lat_ans.device),
                                     torch.from_numpy(lat_inp.data.cpu().float().numpy()).to(lat_inp.device)]
 
             noise = torch.randn_like(lat_ans)
+            ts = torch.randint(
+                0, self.scheduler.config.num_train_timesteps, (B, ), device=noise.device).long()
             lat_noise = self.scheduler.add_noise(lat_ans, noise, ts)
 
             prob = torch.rand(B, device=lat_ans.device)
-            mask = (prob<(DROP*2)).reshape(B, 1, 1)
+            mask = (prob < (DROP*2)).reshape(B, 1, 1)
             hid_edit = torch.where(mask, hid_null, hid_edit)
+            mask = (1.0-((prob >= DROP).to(lat_inp.dtype) *
+                         (prob < (DROP*3)).to(lat_inp.dtype))).reshape(B, 1, 1, 1)
             lat_inp *= mask
 
+            out = self.unet(
+                torch.cat([lat_noise, lat_inp], dim=1), ts, hid_edit).sample
 
+            loss_ce, loss_edit = loss, nn.functional.mse_loss(
+                out, noise, reduction='mean')
+            if int(os.environ['LOCAL_RANK']) == 0:
+                print('loss_ce:', loss_ce, '/', 'loss_edit:', loss_edit)
             loss = loss_ce+loss_edit*0.5
 
         if not return_dict:
@@ -367,9 +404,11 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
         self.resize_token_embeddings(len(tokenizer))
 
         if mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
             self.resize_token_embeddings(len(tokenizer))
+            vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
 
             if num_new_tokens > 0:
                 input_embeddings = self.get_input_embeddings().weight.data
@@ -384,14 +423,16 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
                 output_embeddings[-num_new_tokens:] = output_embeddings_avg
 
             if tune_mm_mlp_adapter:
+                self.get_model().orig_embeds_params = [
+                    self.get_input_embeddings().weight.data.clone().to(device=device)]
                 for p in self.get_input_embeddings().parameters():
                     p.requires_grad = True
                 for p in self.get_output_embeddings().parameters():
                     p.requires_grad = False
 
             if pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(
+                    pretrain_mm_mlp_adapter, map_location='cpu')
                 embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                 assert num_new_tokens == 2
                 if input_embeddings.shape == embed_tokens_weight.shape:
@@ -399,9 +440,12 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
                 elif embed_tokens_weight.shape[0] == num_new_tokens:
                     input_embeddings[-num_new_tokens:] = embed_tokens_weight
                 else:
+                    raise ValueError(
+                        f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
+
-        vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
+        vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
+            [DEFAULT_IMAGE_PATCH_TOKEN])[0]
 
 
-AutoConfig.register("llava", LlavaConfig)
+AutoConfig.register("mg-llava", LlavaConfig)
 AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
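Beyond line re-wrapping, the substantive change is the registration key: LlavaConfig.model_type and the AutoConfig.register call at the bottom of the file both move from "llava" to "mg-llava". Recent transformers releases ship a built-in LLaVA integration that already claims the "llava" model type, so registering a second config under that key raises a ValueError at import time, which is a likely cause of this Space's runtime error; AutoConfig.register also requires the key to match the config class's model_type, which is why the two lines change together. Below is a minimal sketch of the pattern with a stand-in config class (MgLlavaConfig is illustrative, not the repo's class); it assumes a transformers version that already defines "llava".

# Minimal sketch of the registration pattern (stand-in class name, not the
# repo's code). Assumes a transformers release that already defines "llava".
from transformers import AutoConfig, LlamaConfig


class MgLlavaConfig(LlamaConfig):
    model_type = "mg-llava"  # must match the key passed to AutoConfig.register


# AutoConfig.register("llava", MgLlavaConfig) would be rejected: "llava" is
# already claimed by transformers' built-in LLaVA config, and the key would
# not match MgLlavaConfig.model_type anyway. An unclaimed, matching key works:
AutoConfig.register("mg-llava", MgLlavaConfig)

cfg = AutoConfig.for_model("mg-llava")
print(type(cfg).__name__, cfg.model_type)  # MgLlavaConfig mg-llava

The AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) call stays unchanged, since that mapping is keyed by the config class rather than by the model_type string.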