Fix weight init, deprecation warnings, safetensors (#29)
Browse files
- Update weights and modeling file (0d9634b5410c7947cf54ca344be5ad328240721f)
- Tie weights in code, fix updated weights (58c9f97c2a8448696851e2cc95eb1b6919493fe4)
- model.safetensors +3 -0
- modeling_florence2.py +22 -23
    	
        model.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:58757d657ff44051314c8030b68e04cb1bb618ca9a4885418f111f6fb708185a
         | 
| 3 | 
            +
            size 463221266
         | 
    	
        modeling_florence2.py
    CHANGED
    
    | @@ -26,9 +26,10 @@ import torch.utils.checkpoint as checkpoint | |
| 26 | 
             
            from torch.nn import CrossEntropyLoss 
         | 
| 27 | 
             
            from collections import OrderedDict
         | 
| 28 | 
             
            from einops import rearrange
         | 
| 29 | 
            -
            from timm.models.layers import DropPath, trunc_normal_
         | 
| 30 |  | 
| 31 | 
             
            from transformers.modeling_utils import PreTrainedModel
         | 
|  | |
| 32 | 
             
            from transformers.utils import (
         | 
| 33 | 
             
                ModelOutput,
         | 
| 34 | 
             
                add_start_docstrings,
         | 
| @@ -609,29 +610,10 @@ class DaViT(nn.Module): | |
| 609 | 
             
                    self.avgpool = nn.AdaptiveAvgPool1d(1)
         | 
| 610 | 
             
                    self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
         | 
| 611 |  | 
| 612 | 
            -
                    self.apply(self._init_weights)
         | 
| 613 | 
            -
             | 
| 614 | 
             
                @property
         | 
| 615 | 
             
                def dim_out(self):
         | 
| 616 | 
             
                    return self.embed_dims[-1]
         | 
| 617 |  | 
| 618 | 
            -
                def _init_weights(self, m):
         | 
| 619 | 
            -
                    if isinstance(m, nn.Linear):
         | 
| 620 | 
            -
                        trunc_normal_(m.weight, std=0.02)
         | 
| 621 | 
            -
                        if m.bias is not None:
         | 
| 622 | 
            -
                            nn.init.constant_(m.bias, 0)
         | 
| 623 | 
            -
                    elif isinstance(m, nn.Conv2d):
         | 
| 624 | 
            -
                        nn.init.normal_(m.weight, std=0.02)
         | 
| 625 | 
            -
                        for name, _ in m.named_parameters():
         | 
| 626 | 
            -
                            if name in ['bias']:
         | 
| 627 | 
            -
                                nn.init.constant_(m.bias, 0)
         | 
| 628 | 
            -
                    elif isinstance(m, nn.LayerNorm):
         | 
| 629 | 
            -
                        nn.init.constant_(m.weight, 1.0)
         | 
| 630 | 
            -
                        nn.init.constant_(m.bias, 0)
         | 
| 631 | 
            -
                    elif isinstance(m, nn.BatchNorm2d):
         | 
| 632 | 
            -
                        nn.init.constant_(m.weight, 1.0)
         | 
| 633 | 
            -
                        nn.init.constant_(m.bias, 0)
         | 
| 634 | 
            -
             | 
| 635 | 
             
                def forward_features_unpool(self, x):
         | 
| 636 | 
             
                    """
         | 
| 637 | 
             
                    forward until avg pooling 
         | 
| @@ -1450,6 +1432,17 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel): | |
| 1450 | 
             
                        module.weight.data.normal_(mean=0.0, std=std)
         | 
| 1451 | 
             
                        if module.padding_idx is not None:
         | 
| 1452 | 
             
                            module.weight.data[module.padding_idx].zero_()
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1453 |  | 
| 1454 | 
             
                @property
         | 
| 1455 | 
             
                def dummy_inputs(self):
         | 
| @@ -2059,7 +2052,7 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel): | |
| 2059 | 
             
                    )
         | 
| 2060 |  | 
| 2061 |  | 
| 2062 | 
            -
            class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
         | 
| 2063 | 
             
                base_model_prefix = "model"
         | 
| 2064 | 
             
                _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
         | 
| 2065 | 
             
                _keys_to_ignore_on_load_missing = ["final_logits_bias"]
         | 
| @@ -2073,6 +2066,12 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel | |
| 2073 | 
             
                    # Initialize weights and apply final processing
         | 
| 2074 | 
             
                    self.post_init()
         | 
| 2075 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2076 | 
             
                def get_encoder(self):
         | 
| 2077 | 
             
                    return self.model.get_encoder()
         | 
| 2078 |  | 
| @@ -2530,6 +2529,8 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel): | |
| 2530 | 
             
                FLORENCE2_START_DOCSTRING,
         | 
| 2531 | 
             
            )
         | 
| 2532 | 
             
            class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         | 
|  | |
|  | |
| 2533 | 
             
                def __init__(self, config: Florence2Config):
         | 
| 2534 | 
             
                    super().__init__(config)
         | 
| 2535 | 
             
                    assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
         | 
| @@ -2544,8 +2545,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel): | |
| 2544 |  | 
| 2545 | 
             
                    language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
         | 
| 2546 |  | 
| 2547 | 
            -
                    if language_model._tied_weights_keys is not None:
         | 
| 2548 | 
            -
                        self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
         | 
| 2549 | 
             
                    self.language_model = language_model
         | 
| 2550 |  | 
| 2551 | 
             
                    self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         | 
|  | |
| 26 | 
             
            from torch.nn import CrossEntropyLoss 
         | 
| 27 | 
             
            from collections import OrderedDict
         | 
| 28 | 
             
            from einops import rearrange
         | 
| 29 | 
            +
            from timm.layers import DropPath, trunc_normal_
         | 
| 30 |  | 
| 31 | 
             
            from transformers.modeling_utils import PreTrainedModel
         | 
| 32 | 
            +
            from transformers.generation.utils import GenerationMixin
         | 
| 33 | 
             
            from transformers.utils import (
         | 
| 34 | 
             
                ModelOutput,
         | 
| 35 | 
             
                add_start_docstrings,
         | 
|  | |
| 610 | 
             
                    self.avgpool = nn.AdaptiveAvgPool1d(1)
         | 
| 611 | 
             
                    self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
         | 
| 612 |  | 
|  | |
|  | |
| 613 | 
             
                @property
         | 
| 614 | 
             
                def dim_out(self):
         | 
| 615 | 
             
                    return self.embed_dims[-1]
         | 
| 616 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 617 | 
             
                def forward_features_unpool(self, x):
         | 
| 618 | 
             
                    """
         | 
| 619 | 
             
                    forward until avg pooling 
         | 
|  | |
| 1432 | 
             
                        module.weight.data.normal_(mean=0.0, std=std)
         | 
| 1433 | 
             
                        if module.padding_idx is not None:
         | 
| 1434 | 
             
                            module.weight.data[module.padding_idx].zero_()
         | 
| 1435 | 
            +
                    elif isinstance(module, nn.Conv2d):
         | 
| 1436 | 
            +
                        nn.init.normal_(module.weight, std=0.02)
         | 
| 1437 | 
            +
                        for name, _ in module.named_parameters():
         | 
| 1438 | 
            +
                            if name == "bias":
         | 
| 1439 | 
            +
                                nn.init.constant_(module.bias, 0)
         | 
| 1440 | 
            +
                    elif isinstance(module, nn.LayerNorm):
         | 
| 1441 | 
            +
                        nn.init.constant_(module.weight, 1.0)
         | 
| 1442 | 
            +
                        nn.init.constant_(module.bias, 0)
         | 
| 1443 | 
            +
                    elif isinstance(module, nn.BatchNorm2d):
         | 
| 1444 | 
            +
                        nn.init.constant_(module.weight, 1.0)
         | 
| 1445 | 
            +
                        nn.init.constant_(module.bias, 0)
         | 
| 1446 |  | 
| 1447 | 
             
                @property
         | 
| 1448 | 
             
                def dummy_inputs(self):
         | 
|  | |
| 2052 | 
             
                    )
         | 
| 2053 |  | 
| 2054 |  | 
| 2055 | 
            +
            class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
         | 
| 2056 | 
             
                base_model_prefix = "model"
         | 
| 2057 | 
             
                _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
         | 
| 2058 | 
             
                _keys_to_ignore_on_load_missing = ["final_logits_bias"]
         | 
|  | |
| 2066 | 
             
                    # Initialize weights and apply final processing
         | 
| 2067 | 
             
                    self.post_init()
         | 
| 2068 |  | 
| 2069 | 
            +
                def _tie_weights(self):
         | 
| 2070 | 
            +
                    if self.config.tie_word_embeddings:
         | 
| 2071 | 
            +
                        self._tie_or_clone_weights(self.model.encoder.embed_tokens, self.model.shared)
         | 
| 2072 | 
            +
                        self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.model.shared)
         | 
| 2073 | 
            +
                        self._tie_or_clone_weights(self.lm_head, self.model.shared)
         | 
| 2074 | 
            +
             | 
| 2075 | 
             
                def get_encoder(self):
         | 
| 2076 | 
             
                    return self.model.get_encoder()
         | 
| 2077 |  | 
|  | |
| 2529 | 
             
                FLORENCE2_START_DOCSTRING,
         | 
| 2530 | 
             
            )
         | 
| 2531 | 
             
            class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         | 
| 2532 | 
            +
                _tied_weights_keys = ["language_model.encoder.embed_tokens.weight", "language_model.decoder.embed_tokens.weight", "language_model.lm_head.weight"]
         | 
| 2533 | 
            +
             | 
| 2534 | 
             
                def __init__(self, config: Florence2Config):
         | 
| 2535 | 
             
                    super().__init__(config)
         | 
| 2536 | 
             
                    assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
         | 
|  | |
| 2545 |  | 
| 2546 | 
             
                    language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
         | 
| 2547 |  | 
|  | |
|  | |
| 2548 | 
             
                    self.language_model = language_model
         | 
| 2549 |  | 
| 2550 | 
             
                    self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         | 
