Add files using upload-large-folder tool
- config.json +6 -4
- config_molmo.py +6 -5
- model-00007-of-00007.safetensors +2 -2
- model.safetensors.index.json +3 -2
- modeling_molmo.py +34 -30
- preprocessing_molmo.py +1 -3
config.json
CHANGED

@@ -1,11 +1,11 @@
 {
-  "_name_or_path": "/data/chris/hf/7b-v3",
   "architectures": [
-    "MOLMoForCausalLM"
+    "MolmoForCausalLM"
   ],
+  "attention_layer_norm": false,
   "auto_map": {
     "AutoConfig": "config_molmo.MolmoConfig",
-    "AutoModelForCausalLM": "modeling_molmo.MOLMoForCausalLM"
+    "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
   },
   "clip_qkv": null,
   "embedding_size": 152064,
@@ -13,8 +13,10 @@
   "initializer_range": 0.02,
   "intermediate_size": 37888,
   "layer_norm_eps": 1e-06,
+  "layer_norm_type": "rms",
   "max_position_embeddings": 4096,
   "model_type": "molmo",
+  "norm_after": false,
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
@@ -27,4 +29,4 @@
   "use_position_ids": true,
   "vocab_size": 152064,
   "weight_tying": false
-}
+}
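The renamed `auto_map` entries are what let the stock `transformers` Auto classes resolve this repo's custom code. A minimal sketch of how the newly added keys surface once the repo is loaded with remote code enabled; "path/to/molmo-checkpoint" is a placeholder for wherever these files live, not a real model id:

# Sketch only: placeholder path, assumes the files in this commit are present there.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("path/to/molmo-checkpoint", trust_remote_code=True)

# The three keys added in this commit should now be visible on the config object.
print(config.layer_norm_type)       # "rms"
print(config.norm_after)            # False
print(config.attention_layer_norm)  # False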
config_molmo.py
CHANGED

@@ -26,6 +26,9 @@ class MolmoConfig(PretrainedConfig):
         weight_tying: bool = False,
         use_position_ids: bool=True,
         tie_word_embeddings: bool=True,
+        attention_layer_norm: bool=False,
+        norm_after: bool = False,
+        layer_norm_type: str="rms",
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -38,18 +41,16 @@ class MolmoConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.weight_tying = weight_tying
         self.use_position_ids = use_position_ids
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
+        self.attention_layer_norm = attention_layer_norm
         self.num_key_value_heads = num_key_value_heads
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.clip_qkv = clip_qkv
         self.qkv_bias = qkv_bias
+        self.norm_after = norm_after
         self.tie_word_embeddings = tie_word_embeddings
+        self.layer_norm_type = layer_norm_type

         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
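For reference, a hedged sketch of constructing the config directly with the three keyword arguments added here; the values mirror the defaults above and the settings written into config.json, and all other parameters are assumed to keep their defaults:

# Assumes config_molmo.py from this repo is importable.
from config_molmo import MolmoConfig

config = MolmoConfig(
    attention_layer_norm=False,   # no extra norm on the attention projections
    norm_after=False,             # pre-norm rather than post-norm blocks
    layer_norm_type="rms",        # RMSNorm, matching "layer_norm_type": "rms" in config.json
)
assert config.layer_norm_type == "rms"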
model-00007-of-00007.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2c84ff3f7adcfdf9eec4247291ca1fcad02cf7005c84801f31223711df54846a
+size 3799846968
model.safetensors.index.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size":
+    "total_size": 32084101120
   },
   "weight_map": {
     "model.transformer.blocks.0.att_proj.bias": "model-00001-of-00007.safetensors",
@@ -586,6 +586,7 @@
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00007.safetensors",
-    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors"
+    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors",
+    "model.vision_backbone.pad_embed": "model-00007-of-00007.safetensors"
   }
 }
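The index maps every tensor name to the shard that stores it; the new `model.vision_backbone.pad_embed` entry points at the last shard. A small standard-library sketch of looking up a tensor's shard from this file:

import json

# Assumes model.safetensors.index.json from this repo is in the working directory.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])                         # 32084101120 bytes across all shards
print(index["weight_map"]["model.vision_backbone.pad_embed"])  # "model-00007-of-00007.safetensors"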
modeling_molmo.py
CHANGED

@@ -77,7 +77,7 @@ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: b
         x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)


-class
+class MolmoConfigurationError(Exception):
     pass


@@ -189,7 +189,7 @@ class RotaryEmbedding(nn.Module):
         return q_.type_as(q), k_.type_as(k)


-class
+class MolmoBlock(nn.Module):
     """
     A base class for transformer block implementations.
     """

@@ -420,17 +420,17 @@ class OLMoBlock(nn.Module):
     @classmethod
     def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
         if config.block_type == "sequential":
-            return
+            return MolmoSequentialBlock(layer_id, config, cache)
         elif config.block_type == "llama":
             return OLMoLlamaBlock(layer_id, config, cache)
         else:
             raise NotImplementedError(f"Unknown block type: '{config.block_type}'")


-class OLMoLlamaBlock(OLMoBlock):
+class OLMoLlamaBlock(MolmoBlock):
     """
     This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
-    (plus another skip connection). This block is similar to `
+    (plus another skip connection). This block is similar to `MolmoSequentialBlock`
     but some operations have slightly different implementations to imitate the
     behavior of Llama.
     """

@@ -598,7 +598,7 @@ class OLMoLlamaBlock(OLMoBlock):
         return x, cache


-class
+class MolmoSequentialBlock(MolmoBlock):
     """
     This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
     (plus another skip connection).

@@ -825,7 +825,6 @@ class VisionBackboneConfig:
 class FullMolmoConfig:
     d_model: int = 768
     n_heads: int = 12
-    head_dim: int = 64
     n_kv_heads: Optional[int] = None
     qkv_bias: bool = False
     clip_qkv: Optional[float] = None

@@ -908,7 +907,7 @@ class FullMolmoConfig:
             if self.n_kv_heads == n_kv_heads_should_be:
                 return n_kv_heads_should_be
             else:
-                raise
+                raise MolmoConfigurationError(
                     "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
                 )

@@ -1897,7 +1896,7 @@ class LayerNorm(LayerNormBase):
         return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)


-class MOLMo(nn.Module):
+class Molmo(nn.Module):
     def __init__(self, config: FullMolmoConfig, init_params: bool = True):
         super().__init__()
         self.config = config

@@ -1906,7 +1905,7 @@ class MOLMo(nn.Module):
         # Validate config.
         if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
             if self.config.embedding_size < self.config.vocab_size:
-                raise
+                raise MolmoConfigurationError("embedding size should be at least as big as vocab size")
             elif self.config.embedding_size % 128 != 0:
                 import warnings

@@ -1939,7 +1938,7 @@ class MOLMo(nn.Module):
             )
         )

-        blocks = [
+        blocks = [MolmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
         if self.config.block_group_size > 1:
             raise NotImplementedError()
         else:

@@ -2018,16 +2017,20 @@ class MOLMo(nn.Module):
             which input IDs are masked. A `1` value in the mask means that
             the corresponding input ID should *not* be ignored. A `0` means
             that the corresponding input ID is masked.
+
             This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
             library.
         :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
             `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
             to introduce causal or other biases.
+
             If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
             indicates that the i-th element in the sequence is allowed to attend to the j-th
             element in the sequence.
+
             If the tensor is a float tensor, it will just be added to the attention
             scores before the softmax.
+
             The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
         :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
             the response mask. A `1` value in the mask means that the corresponding token

@@ -2258,20 +2261,24 @@ class MOLMo(nn.Module):
         return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]


-class MOLMoForCausalLM(PreTrainedModel):
+class MolmoForCausalLM(PreTrainedModel):
     config_class = MolmoConfig
     base_model_prefix = "model"
-    _no_split_modules = ["
+    _no_split_modules = ["MolmoBlock"]

-    def __init__(self, config: MolmoConfig, model: Optional[
+    def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
         super().__init__(config)

         if not model:
             full_config = FullMolmoConfig(
+                attention_layer_norm=config.attention_layer_norm,
+                image_padding_embed="pad_and_partial_pad",
+                image_pooling_2d="attention-meanq",
                 rope_impl="llama",
                 vocab_size=config.vocab_size,
                 max_sequence_length=config.max_position_embeddings,
                 qkv_bias=config.qkv_bias,
+                norm_after=config.norm_after,
                 embedding_size=config.embedding_size,
                 attention_type="sdpa",
                 embedding_dropout=0,

@@ -2287,9 +2294,9 @@ class MOLMoForCausalLM(PreTrainedModel):
                 additional_vocab_size=128,
                 n_heads=config.num_attention_heads,
                 n_kv_heads=config.num_key_value_heads,
-                rope_theta=
-                layer_norm_eps=
-                layer_norm_type=
+                rope_theta=config.rope_theta,
+                layer_norm_eps=config.layer_norm_eps,
+                layer_norm_type=config.layer_norm_type,
                 pad_tokenizer=True,
                 vit_layers=[-2, -9],
                 vision_backbone=VisionBackboneConfig(

@@ -2312,7 +2319,7 @@ class MOLMoForCausalLM(PreTrainedModel):
                     initializer_range=0.02,
                 )
             )
-            self.model =
+            self.model = Molmo(full_config, init_params=init_params)
         else:
             self.model = model

@@ -2345,7 +2352,7 @@ class MOLMoForCausalLM(PreTrainedModel):
             use_cache = self.config.use_cache

         if output_attentions:
-            raise ValueError("output_attentions is not yet supported in
+            raise ValueError("output_attentions is not yet supported in Molmo")

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -2524,16 +2531,6 @@ class MOLMoForCausalLM(PreTrainedModel):
         model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
         return model_kwargs

-    # TODO: these are required to make the implementation complete.
-    # def resize_position_embeddings(self, new_num_position_embeddings: int):
-    #     pass
-    #
-    # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
-    #     pass
-    #
-    # def _reorder_cache(self, past_key_values, beam_idx):
-    #     pass
-
     def get_input_embeddings(self) -> torch.nn.Module:
         return self.model.transformer.wte

@@ -2555,11 +2552,13 @@ class MOLMoForCausalLM(PreTrainedModel):
     def tie_weights(self):
         """
         This function is intentionally left as a no-op.
+
         Weight tying is handled as follows:
         - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
           See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
         - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
           See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
+
         Therefore, there is no need to explicitly tie the weights in this function.
         """
         pass

@@ -2569,7 +2568,9 @@ class MOLMoForCausalLM(PreTrainedModel):
     ) -> torch.nn.Embedding:
         """
         Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
+
         Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
+
         Arguments:
             new_num_tokens (`int`, *optional*):
                 The new number of tokens in the embedding matrix. Increasing the size will add newly initialized

@@ -2578,12 +2579,15 @@ class MOLMoForCausalLM(PreTrainedModel):
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                 `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
+
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                 details about this, or help on choosing the correct value for resizing, refer to this guide:
                 https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
+
         Return:
             `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
+
         Note:
             This method differs from the base class implementation by resizing the `embedding_size` attribute of the
             model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`

@@ -2614,4 +2618,4 @@ class MOLMoForCausalLM(PreTrainedModel):


 # Always register for multi-modal features
-AutoModelForCausalLM.register(MolmoConfig,
+AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
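With the classes renamed consistently (MolmoBlock, Molmo, MolmoForCausalLM) and registered via `AutoModelForCausalLM.register`, loading through the Auto API should resolve the custom class end to end. A hedged usage sketch, with a placeholder path standing in for the actual repo:

import torch
from transformers import AutoModelForCausalLM

# "path/to/molmo-checkpoint" is a placeholder; trust_remote_code=True pulls in
# modeling_molmo.MolmoForCausalLM via the auto_map set in config.json.
model = AutoModelForCausalLM.from_pretrained(
    "path/to/molmo-checkpoint",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
print(type(model).__name__)  # expected: "MolmoForCausalLM"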
preprocessing_molmo.py
CHANGED

@@ -2,9 +2,7 @@
 Processor class for Molmo.
 """

-from typing import
-
-from transformers.utils.constants import OPENAI_CLIP_STD, OPENAI_CLIP_MEAN
+from typing import Optional

 try:
     from typing import Unpack
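Since the processor class lives in this file, it is typically loaded the same way, through AutoProcessor with remote code enabled. A minimal sketch; the path is a placeholder and it assumes the repo also wires the processor class into its auto_map, which this commit does not show:

from transformers import AutoProcessor

# Placeholder path; assumes the processor auto_map entries are configured as usual for Molmo checkpoints.
processor = AutoProcessor.from_pretrained("path/to/molmo-checkpoint", trust_remote_code=True)
print(type(processor).__name__)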