Spaces:

Luigi
/

ZeroGPU-LLM-Inference

Running

App Files Community

Luigi committed 29 days ago

Commit

e3e334f

1 Parent(s): 4500f92

Use actual parameter count for AOT decision instead of string matching

Browse files

Files changed (1) hide show

app.py +6 -33

app.py CHANGED Viewed

@@ -453,37 +453,10 @@ def chat_response(user_msg, chat_history, system_prompt,
         pipe = load_pipeline(model_name)
-        # AOT compilation for performance optimization (only for larger models)
-        # Estimate model size
-        model_size = 0
-        if '30B' in model_name or '32B' in model_name:
-            model_size = 30
-        elif '20B' in model_name:
-            model_size = 20
-        elif '15B' in model_name or '14B' in model_name:
-            model_size = 15
-        elif '4B' in model_name or '3B' in model_name:
-            model_size = 4
-        elif '2B' in model_name or '1.7B' in model_name:
-            model_size = 2
-        elif '1.5B' in model_name or '1.2B' in model_name or '1.1B' in model_name:
-            model_size = 1.5
-        elif '1B' in model_name:
-            model_size = 1
-        elif '700M' in model_name or '600M' in model_name:
-            model_size = 0.7
-        elif '500M' in model_name:
-            model_size = 0.5
-        elif '360M' in model_name or '350M' in model_name:
-            model_size = 0.35
-        elif '270M' in model_name:
-            model_size = 0.27
-        elif '135M' in model_name:
-            model_size = 0.135
-        else:
-            model_size = 4  # default
-        use_aot = model_size >= 2  # Only compile models >= 2B parameters
         if use_aot:
             try:
@@ -510,11 +483,11 @@ def chat_response(user_msg, chat_history, system_prompt,
                 )
                 compiled = spaces.aoti_compile(exported)
                 spaces.aoti_apply(compiled, pipe.model)
-                print(f"AOT compilation successful for {model_name}")
             except Exception as e:
                 print(f"AOT compilation failed for {model_name}: {e}")
         else:
-            print(f"Skipping AOT compilation for small model {model_name}")
         prompt = format_conversation(history, enriched, pipe.tokenizer)
         prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"

         pipe = load_pipeline(model_name)
+        # Determine actual model size for AOT decision
+        actual_params = sum(p.numel() for p in pipe.model.parameters())
+        model_size_b = actual_params / 1e9  # Convert to billions
+        use_aot = model_size_b >= 2  # Only compile models >= 2B parameters
         if use_aot:
             try:
                 )
                 compiled = spaces.aoti_compile(exported)
                 spaces.aoti_apply(compiled, pipe.model)
+                print(f"AOT compilation successful for {model_name} ({model_size_b:.1f}B parameters)")
             except Exception as e:
                 print(f"AOT compilation failed for {model_name}: {e}")
         else:
+            print(f"Skipping AOT compilation for small model {model_name} ({model_size_b:.1f}B parameters)")
         prompt = format_conversation(history, enriched, pipe.tokenizer)
         prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"