Add files using upload-large-folder tool
- config.json +6 -4
- config_molmo.py +6 -5
- model-00007-of-00007.safetensors +2 -2
- model.safetensors.index.json +3 -2
- modeling_molmo.py +34 -30
- preprocessing_molmo.py +1 -3
config.json
CHANGED

@@ -1,11 +1,11 @@
 {
-  "_name_or_path": "/data/chris/hf/7b-v3",
   "architectures": [
-    "MOLMoForCausalLM"
+    "MolmoForCausalLM"
   ],
+  "attention_layer_norm": false,
   "auto_map": {
     "AutoConfig": "config_molmo.MolmoConfig",
-    "AutoModelForCausalLM": "modeling_molmo.MOLMoForCausalLM"
+    "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
   },
   "clip_qkv": null,
   "embedding_size": 152064,
@@ -13,8 +13,10 @@
   "initializer_range": 0.02,
   "intermediate_size": 37888,
   "layer_norm_eps": 1e-06,
+  "layer_norm_type": "rms",
   "max_position_embeddings": 4096,
   "model_type": "molmo",
+  "norm_after": false,
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
@@ -27,4 +29,4 @@
   "use_position_ids": true,
   "vocab_size": 152064,
   "weight_tying": false
-}
+}
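The renamed `auto_map` entries are what let the stock `transformers` Auto classes resolve this repo's custom code. A minimal sketch of how the newly added keys surface once the repo is loaded with remote code enabled; "path/to/molmo-checkpoint" is a placeholder for wherever these files live, not a real model id:

# Sketch only: placeholder path, assumes the files in this commit are present there.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("path/to/molmo-checkpoint", trust_remote_code=True)

# The three keys added in this commit should now be visible on the config object.
print(config.layer_norm_type)       # "rms"
print(config.norm_after)            # False
print(config.attention_layer_norm)  # False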
config_molmo.py
CHANGED

@@ -26,6 +26,9 @@ class MolmoConfig(PretrainedConfig):
         weight_tying: bool = False,
         use_position_ids: bool=True,
         tie_word_embeddings: bool=True,
+        attention_layer_norm: bool=False,
+        norm_after: bool = False,
+        layer_norm_type: str="rms",
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -38,18 +41,16 @@ class MolmoConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.weight_tying = weight_tying
         self.use_position_ids = use_position_ids
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
+        self.attention_layer_norm = attention_layer_norm
         self.num_key_value_heads = num_key_value_heads
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.clip_qkv = clip_qkv
         self.qkv_bias = qkv_bias
+        self.norm_after = norm_after
         self.tie_word_embeddings = tie_word_embeddings
+        self.layer_norm_type = layer_norm_type

         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
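For reference, a hedged sketch of constructing the config directly with the three keyword arguments added here; the values mirror the defaults above and the settings written into config.json, and all other parameters are assumed to keep their defaults:

# Assumes config_molmo.py from this repo is importable.
from config_molmo import MolmoConfig

config = MolmoConfig(
    attention_layer_norm=False,   # no extra norm on the attention projections
    norm_after=False,             # pre-norm rather than post-norm blocks
    layer_norm_type="rms",        # RMSNorm, matching "layer_norm_type": "rms" in config.json
)
assert config.layer_norm_type == "rms"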
model-00007-of-00007.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2c84ff3f7adcfdf9eec4247291ca1fcad02cf7005c84801f31223711df54846a
+size 3799846968
model.safetensors.index.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size":
+    "total_size": 32084101120
   },
   "weight_map": {
     "model.transformer.blocks.0.att_proj.bias": "model-00001-of-00007.safetensors",
@@ -586,6 +586,7 @@
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00007.safetensors",
-    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors"
+    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors",
+    "model.vision_backbone.pad_embed": "model-00007-of-00007.safetensors"
   }
 }
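The index maps every tensor name to the shard that stores it; the new `model.vision_backbone.pad_embed` entry points at the last shard. A small standard-library sketch of looking up a tensor's shard from this file:

import json

# Assumes model.safetensors.index.json from this repo is in the working directory.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])                         # 32084101120 bytes across all shards
print(index["weight_map"]["model.vision_backbone.pad_embed"])  # "model-00007-of-00007.safetensors"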
modeling_molmo.py
CHANGED

@@ -77,7 +77,7 @@ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: b
         x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)


-class
+class MolmoConfigurationError(Exception):
     pass


@@ -189,7 +189,7 @@ class RotaryEmbedding(nn.Module):
         return q_.type_as(q), k_.type_as(k)


-class
+class MolmoBlock(nn.Module):
     """
     A base class for transformer block implementations.
     """

@@ -420,17 +420,17 @@ class OLMoBlock(nn.Module):
     @classmethod
     def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
         if config.block_type == "sequential":
-            return
+            return MolmoSequentialBlock(layer_id, config, cache)
         elif config.block_type == "llama":
             return OLMoLlamaBlock(layer_id, config, cache)
         else:
             raise NotImplementedError(f"Unknown block type: '{config.block_type}'")


-class OLMoLlamaBlock(OLMoBlock):
+class OLMoLlamaBlock(MolmoBlock):
     """
     This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
-    (plus another skip connection). This block is similar to `
+    (plus another skip connection). This block is similar to `MolmoSequentialBlock`
     but some operations have slightly different implementations to imitate the
     behavior of Llama.
     """

@@ -598,7 +598,7 @@ class OLMoLlamaBlock(OLMoBlock):
         return x, cache


-class
+class MolmoSequentialBlock(MolmoBlock):
     """
     This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
     (plus another skip connection).

@@ -825,7 +825,6 @@ class VisionBackboneConfig:
 class FullMolmoConfig:
     d_model: int = 768
     n_heads: int = 12
-    head_dim: int = 64
     n_kv_heads: Optional[int] = None
     qkv_bias: bool = False
     clip_qkv: Optional[float] = None

@@ -908,7 +907,7 @@ class FullMolmoConfig:
             if self.n_kv_heads == n_kv_heads_should_be:
                 return n_kv_heads_should_be
             else:
-                raise
+                raise MolmoConfigurationError(
                     "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
                 )

@@ -1897,7 +1896,7 @@ class LayerNorm(LayerNormBase):
         return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)


-class MOLMo(nn.Module):
+class Molmo(nn.Module):
     def __init__(self, config: FullMolmoConfig, init_params: bool = True):
         super().__init__()
         self.config = config

@@ -1906,7 +1905,7 @@ class MOLMo(nn.Module):
         # Validate config.
         if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
             if self.config.embedding_size < self.config.vocab_size:
-                raise
+                raise MolmoConfigurationError("embedding size should be at least as big as vocab size")
             elif self.config.embedding_size % 128 != 0:
                 import warnings

@@ -1939,7 +1938,7 @@ class MOLMo(nn.Module):
             )
         )

-        blocks = [
+        blocks = [MolmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
         if self.config.block_group_size > 1:
             raise NotImplementedError()
         else:

@@ -2018,16 +2017,20 @@ class MOLMo(nn.Module):
             which input IDs are masked. A `1` value in the mask means that
             the corresponding input ID should *not* be ignored. A `0` means
             that the corresponding input ID is masked.
+
             This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
             library.
         :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
             `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
             to introduce causal or other biases.
+
             If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
             indicates that the i-th element in the sequence is allowed to attend to the j-th
             element in the sequence.
+
             If the tensor is a float tensor, it will just be added to the attention
             scores before the softmax.
+
             The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
         :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
             the response mask. A `1` value in the mask means that the corresponding token

@@ -2258,20 +2261,24 @@ class MOLMo(nn.Module):
         return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]


-class MOLMoForCausalLM(PreTrainedModel):
+class MolmoForCausalLM(PreTrainedModel):
     config_class = MolmoConfig
     base_model_prefix = "model"
-    _no_split_modules = ["
+    _no_split_modules = ["MolmoBlock"]

-    def __init__(self, config: MolmoConfig, model: Optional[
+    def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
         super().__init__(config)

         if not model:
             full_config = FullMolmoConfig(
+                attention_layer_norm=config.attention_layer_norm,
+                image_padding_embed="pad_and_partial_pad",
+                image_pooling_2d="attention-meanq",
                 rope_impl="llama",
                 vocab_size=config.vocab_size,
                 max_sequence_length=config.max_position_embeddings,
                 qkv_bias=config.qkv_bias,
+                norm_after=config.norm_after,
                 embedding_size=config.embedding_size,
                 attention_type="sdpa",
                 embedding_dropout=0,

@@ -2287,9 +2294,9 @@ class MOLMoForCausalLM(PreTrainedModel):
                 additional_vocab_size=128,
                 n_heads=config.num_attention_heads,
                 n_kv_heads=config.num_key_value_heads,
-                rope_theta=
-                layer_norm_eps=
-                layer_norm_type=
+                rope_theta=config.rope_theta,
+                layer_norm_eps=config.layer_norm_eps,
+                layer_norm_type=config.layer_norm_type,
                 pad_tokenizer=True,
                 vit_layers=[-2, -9],
                 vision_backbone=VisionBackboneConfig(

@@ -2312,7 +2319,7 @@ class MOLMoForCausalLM(PreTrainedModel):
                     initializer_range=0.02,
                 )
             )
-            self.model =
+            self.model = Molmo(full_config, init_params=init_params)
         else:
             self.model = model

@@ -2345,7 +2352,7 @@ class MOLMoForCausalLM(PreTrainedModel):
             use_cache = self.config.use_cache

         if output_attentions:
-            raise ValueError("output_attentions is not yet supported in
+            raise ValueError("output_attentions is not yet supported in Molmo")

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -2524,16 +2531,6 @@ class MOLMoForCausalLM(PreTrainedModel):
         model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
         return model_kwargs

-    # TODO: these are required to make the implementation complete.
-    # def resize_position_embeddings(self, new_num_position_embeddings: int):
-    #     pass
-    #
-    # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
-    #     pass
-    #
-    # def _reorder_cache(self, past_key_values, beam_idx):
-    #     pass
-
     def get_input_embeddings(self) -> torch.nn.Module:
         return self.model.transformer.wte

@@ -2555,11 +2552,13 @@ class MOLMoForCausalLM(PreTrainedModel):
     def tie_weights(self):
         """
         This function is intentionally left as a no-op.
+
         Weight tying is handled as follows:
         - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
           See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
         - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
           See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
+
         Therefore, there is no need to explicitly tie the weights in this function.
         """
         pass

@@ -2569,7 +2568,9 @@ class MOLMoForCausalLM(PreTrainedModel):
     ) -> torch.nn.Embedding:
         """
         Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
+
         Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
+
         Arguments:
             new_num_tokens (`int`, *optional*):
                 The new number of tokens in the embedding matrix. Increasing the size will add newly initialized

@@ -2578,12 +2579,15 @@ class MOLMoForCausalLM(PreTrainedModel):
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                 `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
+
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                 details about this, or help on choosing the correct value for resizing, refer to this guide:
                 https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
+
         Return:
             `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
+
         Note:
             This method differs from the base class implementation by resizing the `embedding_size` attribute of the
             model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`

@@ -2614,4 +2618,4 @@ class MOLMoForCausalLM(PreTrainedModel):


 # Always register for multi-modal features
-AutoModelForCausalLM.register(MolmoConfig,
+AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
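With the classes renamed consistently (MolmoBlock, Molmo, MolmoForCausalLM) and registered via `AutoModelForCausalLM.register`, loading through the Auto API should resolve the custom class end to end. A hedged usage sketch, with a placeholder path standing in for the actual repo:

import torch
from transformers import AutoModelForCausalLM

# "path/to/molmo-checkpoint" is a placeholder; trust_remote_code=True pulls in
# modeling_molmo.MolmoForCausalLM via the auto_map set in config.json.
model = AutoModelForCausalLM.from_pretrained(
    "path/to/molmo-checkpoint",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
print(type(model).__name__)  # expected: "MolmoForCausalLM"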
preprocessing_molmo.py
CHANGED

@@ -2,9 +2,7 @@
 Processor class for Molmo.
 """

-from typing import
-
-from transformers.utils.constants import OPENAI_CLIP_STD, OPENAI_CLIP_MEAN
+from typing import Optional

 try:
     from typing import Unpack
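Since the processor class lives in this file, it is typically loaded the same way, through AutoProcessor with remote code enabled. A minimal sketch; the path is a placeholder and it assumes the repo also wires the processor class into its auto_map, which this commit does not show:

from transformers import AutoProcessor

# Placeholder path; assumes the processor auto_map entries are configured as usual for Molmo checkpoints.
processor = AutoProcessor.from_pretrained("path/to/molmo-checkpoint", trust_remote_code=True)
print(type(processor).__name__)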