Commit: makes modifications for zerogpu spaces

Files changed:
- app.py (+1 -1)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -116,7 +116,7 @@ def load_model():
     # Load the full fine-tuned model with optimized settings
     model_kwargs = {
         "device_map": "auto" if DEVICE == "cuda" else "cpu",
-        "torch_dtype": torch.
+        "torch_dtype": torch.bfloat16 if DEVICE == "cuda" else torch.float32,  # Use bfloat16 on GPU, float32 on CPU
         "trust_remote_code": True,
         "low_cpu_mem_usage": True,
         "attn_implementation": "flash_attention_2" if DEVICE == "cuda" else "eager"
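For context, here is a minimal sketch of how these kwargs typically plug into model loading and inference in a ZeroGPU Space. The model ID, the generate() function, and the decorator placement are illustrative assumptions, not taken from this commit; only model_kwargs mirrors the diff above.

# Sketch only: MODEL_ID and generate() are illustrative placeholders.
import torch
import spaces  # ZeroGPU helper package, preinstalled on ZeroGPU Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "your-org/your-model"  # placeholder

model_kwargs = {
    "device_map": "auto" if DEVICE == "cuda" else "cpu",
    "torch_dtype": torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,
    "attn_implementation": "flash_attention_2" if DEVICE == "cuda" else "eager",
}

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

@spaces.GPU  # requests a GPU slice for the duration of each call
def generate(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)

The switch to torch.bfloat16 suits ZeroGPU hardware: bfloat16 keeps float32's exponent range, so it avoids the overflow issues float16 can hit during generation, at the same memory cost.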
requirements.txt CHANGED

@@ -1,3 +1,4 @@
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch1.12cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 gradio>=5.38.2
 torch>=2.0.0
 transformers>=4.54.0
@@ -9,4 +10,3 @@ pyyaml>=6.0
 psutil>=5.9.0
 tqdm>=4.64.0
 requests>=2.31.0
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch1.12cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
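This change only moves the pinned flash-attn wheel to the top of the file; the dependency set is unchanged. Pinning a prebuilt wheel sidesteps compiling flash-attn's CUDA kernels during the Space build, but the wheel's tags (CUDA 11.8, torch 1.12, CPython 3.10) must match the runtime, and the torch 1.12 tag sits oddly next to the torch>=2.0.0 pin. A hedged sketch (the helper name is assumed, not from this commit) of a runtime guard that falls back to eager attention if the wheel fails to import:

# Sketch only: pick_attn_implementation() is an illustrative helper.
def pick_attn_implementation(device: str) -> str:
    """Return a transformers attn_implementation string for this runtime."""
    if device != "cuda":
        return "eager"
    try:
        import flash_attn  # noqa: F401  # installed from the pinned wheel
        return "flash_attention_2"
    except ImportError:
        return "eager"

In the app.py diff above, attn_implementation is chosen purely on DEVICE; a guard like this would also cover the case where the wheel is present but built against a mismatched torch, which surfaces as an ImportError on undefined symbols.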