removed flash attention
Files changed:
- models/micro_llama.py +1 -1
- models/micro_moe_llama.py +1 -1
- models/micro_olmo.py +1 -1
- requirements.txt +1 -2
- router_backend.py +1 -1
models/micro_llama.py CHANGED

@@ -249,7 +249,7 @@ class MiCRoLlama(LlamaPreTrainedModel, GenerationMixin):
         self.config: MiCRoLlamaConfig = config
         self.config.torch_dtype = torch.bfloat16
         self.config.use_bfloat16 = True
-        self.config._attn_implementation = "flash_attention_2"
+        self.config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
         self.config.backbone_num_layers = self.config.num_hidden_layers
         self.config.num_hidden_layers = self.config.num_hidden_layers * run_config["num-experts"]
         self.config.loss_type = "ForCausalLMLoss"
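For context, `_attn_implementation` is the private config field that transformers consults when dispatching attention; the public counterpart is the `attn_implementation` keyword of `from_pretrained`. A minimal sketch of the same switch outside the Space (the checkpoint id below is a placeholder, not one of the models used here):

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder checkpoint; attn_implementation mirrors the config field changed in this commit.
model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-checkpoint",
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",  # or "sdpa" / "flash_attention_2"
)
```

With "eager", attention falls back to the plain PyTorch implementation, so the model loads without the flash-attn package installed.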
models/micro_moe_llama.py CHANGED

@@ -275,7 +275,7 @@ class MiCRoLlamaMoE(LlamaPreTrainedModel, GenerationMixin):
         self.config: MiCRoLlamaMoEConfig = config
         self.config.torch_dtype = torch.bfloat16
         self.config.use_bfloat16 = True
-        self.config._attn_implementation = "flash_attention_2"
+        self.config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
         self.config.use_cache = True
         self.config.backbone_num_layers = self.config.num_hidden_layers
         self.config.num_hidden_layers = self.config.num_hidden_layers
models/micro_olmo.py CHANGED

@@ -191,7 +191,7 @@ class MiCRoOLMo(Olmo2PreTrainedModel, GenerationMixin):
         self.config: Olmo2Config = config
         self.config.torch_dtype = torch.bfloat16
         self.config.use_bfloat16 = True
-        self.config._attn_implementation = "flash_attention_2"
+        self.config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
         self.config.use_cache = True
         self.config.backbone_num_layers = self.config.num_hidden_layers
         self.config.num_hidden_layers = self.config.num_hidden_layers * run_config["num-experts"]
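In the Llama and OLMo variants, the unchanged context lines keep the original depth in backbone_num_layers and then scale num_hidden_layers by the expert count from run_config. A small worked sketch of that arithmetic with hypothetical values; only the "num-experts" key name comes from the diff:

```python
# Hypothetical numbers for illustration; only the "num-experts" key name is from the diff.
run_config = {"num-experts": 4}

backbone_num_layers = 16  # depth of one expert backbone
num_hidden_layers = backbone_num_layers * run_config["num-experts"]

print(backbone_num_layers, num_hidden_layers)  # 16 64: total decoder layers across experts
```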
requirements.txt CHANGED

@@ -3,5 +3,4 @@ plotly>=5.22.0
 pandas>=2.2.0
 torch==2.7.1
 transformers==4.53.2
-numpy==2.3.4
-flash-attn
+numpy==2.3.4
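Dropping flash-attn from requirements.txt means the Space no longer has to build the CUDA extension at install time. If the wheel happens to be installed anyway, the backend could still be picked at runtime; a hedged sketch of such a fallback, not part of the Space's own code:

```python
import importlib.util

def pick_attn_implementation() -> str:
    """Illustrative helper: choose a transformers attention backend based on what is installed."""
    if importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"  # optional acceleration when the flash-attn wheel exists
    return "eager"                  # pure-PyTorch path, no extra dependency

print(pick_attn_implementation())
```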
router_backend.py CHANGED

@@ -195,7 +195,7 @@ def build_model(model_id: str, hf_token: str, use_cache: bool = True):

     model_config.torch_dtype = torch.bfloat16
     model_config.use_bfloat16 = True
-    model_config._attn_implementation = "flash_attention_2"
+    model_config._attn_implementation = "eager" # {sdpa, flash_attention_2, eager}
     model_config.use_cache = use_cache
     model_config.ablate = []

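Since router_backend.py builds the served model, one quick way to gain confidence in the backend swap is to compare logits between "eager" and "sdpa" on the same input. A hedged sanity-check sketch, assuming a small causal-LM checkpoint that fits in memory; the model id is a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/your-checkpoint"  # placeholder, not the Space's actual model
tok = AutoTokenizer.from_pretrained(model_id)
inputs = tok("hello world", return_tensors="pt")

logits = {}
for impl in ("eager", "sdpa"):
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, attn_implementation=impl
    ).eval()
    with torch.no_grad():
        logits[impl] = model(**inputs).logits

# The two backends implement the same math, so outputs should agree to bf16 tolerance.
print(torch.allclose(logits["eager"], logits["sdpa"], atol=1e-2))
```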
|