JqzAugUST committed
Commit 075d36d · verified · 1 Parent(s): 7149349

Update eagle/model/modeling_minicpm_kv.py

Files changed (1)
  1. eagle/model/modeling_minicpm_kv.py +9 -14
eagle/model/modeling_minicpm_kv.py CHANGED

@@ -1575,19 +1575,11 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
 
         return attn_output, None, past_key_value
 
-# <mod> dev: support sdpa only
-# MINICPM_ATTENTION_CLASSES = {
-#     'eager': MiniCPMAttention,
-#     'flash_attention_2': MiniCPMFlashAttention2,
-#     'sdpa': MiniCPMSdpaAttention,
-# }
-# <before-after-mod> -------------------------------------------------
 MINICPM_ATTENTION_CLASSES = {
     'eager': MiniCPMAttention,
-    'flash_attention_2': MiniCPMAttention,
-    'sdpa': MiniCPMAttention,
+    'flash_attention_2': MiniCPMFlashAttention2,
+    'sdpa': MiniCPMSdpaAttention,
 }
-# </mod>
 
 class MiniCPMDecoderLayer(nn.Module):
     def __init__(self, config: MiniCPMConfig, layer_idx: int):
@@ -1596,7 +1588,11 @@ class MiniCPMDecoderLayer(nn.Module):
         if config.sparse_config is not None and torch.cuda.is_available():
             self.self_attn = MiniCPMInfLLMv2Attention(config=config, layer_idx=layer_idx)
         else:
-            self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+            # <mod>
+            # self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+            # <before-after-mod> -------------------------------------------------
+            self.self_attn = MINICPM_ATTENTION_CLASSES["eager"](config=config, layer_idx=layer_idx)
+            # </mod>
 
         self.mlp = MiniCPMMLP(config)
         self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -1805,7 +1801,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         # self._use_sdpa = config._attn_implementation == 'sdpa'
         # self._use_flash_attention_2 = config._attn_implementation == 'flash_attention_2'
         # <before-after-mod> -------------------------------------------------
-        self._use_sdpa, self._use_flash_attention_2 = True, False
+        self._use_sdpa, self._use_flash_attention_2 = False, False
         # </mod>
 
         self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -1963,8 +1959,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         #     attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
         # )
         # <before-after-mod> -------------------------------------------------
-        if not self._use_sdpa:
-            raise NotImplementedError("JQZ 250917 | Currently support sdpa **ONLY**, further impl for flash attention or infllm attention not finished yet.")
+        # For HF space demo, use MiniCPMAttention **ONLY**
         # # below is copied from modeling_llama_kv.py, Line 1110
         if attention_mask is None:
             attention_mask = torch.ones(
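
Net effect of the diff, for readers of the Space code: MINICPM_ATTENTION_CLASSES again maps 'flash_attention_2' and 'sdpa' to MiniCPMFlashAttention2 and MiniCPMSdpaAttention, but MiniCPMDecoderLayer no longer consults config._attn_implementation and always instantiates the 'eager' MiniCPMAttention, with self._use_sdpa and self._use_flash_attention_2 both forced to False. The sketch below is a standalone toy of the dispatch pattern being overridden; SimpleConfig, the Dummy* classes, and build_attention are hypothetical names used only for illustration and are not part of modeling_minicpm_kv.py.

# Toy illustration of the attention-class lookup this commit hard-codes.
# All names here (SimpleConfig, Dummy*, build_attention) are hypothetical.

class DummyEagerAttention:            # stand-in for MiniCPMAttention
    def __init__(self, config, layer_idx):
        self.kind = 'eager'

class DummyFlashAttention2:           # stand-in for MiniCPMFlashAttention2
    def __init__(self, config, layer_idx):
        self.kind = 'flash_attention_2'

class DummySdpaAttention:             # stand-in for MiniCPMSdpaAttention
    def __init__(self, config, layer_idx):
        self.kind = 'sdpa'

ATTENTION_CLASSES = {
    'eager': DummyEagerAttention,
    'flash_attention_2': DummyFlashAttention2,
    'sdpa': DummySdpaAttention,
}

class SimpleConfig:
    def __init__(self, attn_implementation='sdpa'):
        self._attn_implementation = attn_implementation

def build_attention(config, layer_idx, force_eager=True):
    # Before this commit the key came from config._attn_implementation;
    # after it, the decoder layer uses 'eager' unconditionally (HF Space demo).
    key = 'eager' if force_eager else config._attn_implementation
    return ATTENTION_CLASSES[key](config=config, layer_idx=layer_idx)

attn = build_attention(SimpleConfig('sdpa'), layer_idx=0)
print(attn.kind)  # prints 'eager' regardless of the configured implementation

Pinning the eager path avoids flash-attn kernels and SDPA-specific mask handling on the demo hardware, which matches the added comment "For HF space demo, use MiniCPMAttention **ONLY**".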