removes flash attention from app
app.py
@@ -119,7 +119,7 @@ def load_model():
         "torch_dtype": torch.bfloat16 if DEVICE == "cuda" else torch.float32,  # Use bfloat16 on GPU, float32 on CPU
         "trust_remote_code": True,
         "low_cpu_mem_usage": True,
-        "attn_implementation": "flash_attention_2" if DEVICE == "cuda" else "eager"
+        # "attn_implementation": "flash_attention_2" if DEVICE == "cuda" else "eager"
     }
 
     logger.info(f"Model loading parameters: {model_kwargs}")
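
Commenting out "attn_implementation" leaves transformers to pick its default attention backend, so the model loads without a hard dependency on the flash-attn package, which is presumably unavailable or problematic on this Space's ZeroGPU hardware. Below is a minimal sketch of how kwargs like these are typically passed to from_pretrained; the model id and the DEVICE setup are illustrative assumptions, not taken from the Space's code:

import torch
from transformers import AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model_kwargs = {
    "torch_dtype": torch.bfloat16 if DEVICE == "cuda" else torch.float32,  # bfloat16 on GPU, float32 on CPU
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,
    # "attn_implementation" is intentionally omitted: transformers falls back
    # to its default backend instead of requiring the flash-attn package.
}

model = AutoModelForCausalLM.from_pretrained(
    "example-org/example-model",  # hypothetical model id
    **model_kwargs,
)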