removed flash attention
Files changed:
- models/micro_llama.py +1 -1
- models/micro_moe_llama.py +1 -1
- models/micro_olmo.py +1 -1
- requirements.txt +1 -2
- router_backend.py +1 -1
models/micro_llama.py CHANGED

@@ -249,7 +249,7 @@ class MiCRoLlama(LlamaPreTrainedModel, GenerationMixin):
         self.config: MiCRoLlamaConfig = config
         self.config.torch_dtype = torch.bfloat16
         self.config.use_bfloat16 = True
-        self.config._attn_implementation = "flash_attention_2"
+        self.config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
         self.config.backbone_num_layers = self.config.num_hidden_layers
         self.config.num_hidden_layers = self.config.num_hidden_layers * run_config["num-experts"]
         self.config.loss_type = "ForCausalLMLoss"
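For context, `_attn_implementation` is the private config field that transformers consults when dispatching attention; the public counterpart is the `attn_implementation` keyword of `from_pretrained`. A minimal sketch of the same switch outside the Space (the checkpoint id below is a placeholder, not one of the models used here):

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder checkpoint; attn_implementation mirrors the config field changed in this commit.
model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-checkpoint",
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",  # or "sdpa" / "flash_attention_2"
)
```

With "eager", attention falls back to the plain PyTorch implementation, so the model loads without the flash-attn package installed.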
models/micro_moe_llama.py CHANGED

@@ -275,7 +275,7 @@ class MiCRoLlamaMoE(LlamaPreTrainedModel, GenerationMixin):
         self.config: MiCRoLlamaMoEConfig = config
         self.config.torch_dtype = torch.bfloat16
         self.config.use_bfloat16 = True
-        self.config._attn_implementation = "flash_attention_2"
+        self.config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
         self.config.use_cache = True
         self.config.backbone_num_layers = self.config.num_hidden_layers
         self.config.num_hidden_layers = self.config.num_hidden_layers
models/micro_olmo.py CHANGED

@@ -191,7 +191,7 @@ class MiCRoOLMo(Olmo2PreTrainedModel, GenerationMixin):
         self.config: Olmo2Config = config
         self.config.torch_dtype = torch.bfloat16
         self.config.use_bfloat16 = True
-        self.config._attn_implementation = "flash_attention_2"
+        self.config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
         self.config.use_cache = True
         self.config.backbone_num_layers = self.config.num_hidden_layers
         self.config.num_hidden_layers = self.config.num_hidden_layers * run_config["num-experts"]
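In the Llama and OLMo variants, the unchanged context lines keep the original depth in backbone_num_layers and then scale num_hidden_layers by the expert count from run_config. A small worked sketch of that arithmetic with hypothetical values; only the "num-experts" key name comes from the diff:

```python
# Hypothetical numbers for illustration; only the "num-experts" key name is from the diff.
run_config = {"num-experts": 4}

backbone_num_layers = 16  # depth of one expert backbone
num_hidden_layers = backbone_num_layers * run_config["num-experts"]

print(backbone_num_layers, num_hidden_layers)  # 16 64: total decoder layers across experts
```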
requirements.txt CHANGED

@@ -3,5 +3,4 @@ plotly>=5.22.0
 pandas>=2.2.0
 torch==2.7.1
 transformers==4.53.2
-numpy==2.3.4
-flash-attn
+numpy==2.3.4
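Dropping flash-attn from requirements.txt means the Space no longer has to build the CUDA extension at install time. If the wheel happens to be installed anyway, the backend could still be picked at runtime; a hedged sketch of such a fallback, not part of the Space's own code:

```python
import importlib.util

def pick_attn_implementation() -> str:
    """Illustrative helper: choose a transformers attention backend based on what is installed."""
    if importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"  # optional acceleration when the flash-attn wheel exists
    return "eager"                  # pure-PyTorch path, no extra dependency

print(pick_attn_implementation())
```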
router_backend.py CHANGED

@@ -195,7 +195,7 @@ def build_model(model_id: str, hf_token: str, use_cache: bool = True):

     model_config.torch_dtype = torch.bfloat16
     model_config.use_bfloat16 = True
-    model_config._attn_implementation = "flash_attention_2"
+    model_config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
     model_config.use_cache = use_cache
     model_config.ablate = []

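Since router_backend.py builds the served model, one quick way to gain confidence in the backend swap is to compare logits between "eager" and "sdpa" on the same input. A hedged sanity-check sketch, assuming a small causal-LM checkpoint that fits in memory; the model id is a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/your-checkpoint"  # placeholder, not the Space's actual model
tok = AutoTokenizer.from_pretrained(model_id)
inputs = tok("hello world", return_tensors="pt")

logits = {}
for impl in ("eager", "sdpa"):
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, attn_implementation=impl
    ).eval()
    with torch.no_grad():
        logits[impl] = model(**inputs).logits

# The two backends implement the same math, so outputs should agree to bf16 tolerance.
print(torch.allclose(logits["eager"], logits["sdpa"], atol=1e-2))
```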
|