Make AOT compilation conditional for models >= 2B parameters to optimize free tier usage

app.py CHANGED
@@ -330,10 +330,44 @@ def format_conversation(history, system_prompt, tokenizer):
     return prompt

 def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
+    # Estimate model size (rough approximation based on model name)
+    model_size = 0
+    if '30B' in model_name or '32B' in model_name:
+        model_size = 30
+    elif '20B' in model_name:
+        model_size = 20
+    elif '15B' in model_name or '14B' in model_name:
+        model_size = 15
+    elif '4B' in model_name or '3B' in model_name:
+        model_size = 4
+    elif '2B' in model_name or '1.7B' in model_name:
+        model_size = 2
+    elif '1.5B' in model_name or '1.2B' in model_name or '1.1B' in model_name:
+        model_size = 1.5
+    elif '1B' in model_name:
+        model_size = 1
+    elif '700M' in model_name or '600M' in model_name:
+        model_size = 0.7
+    elif '500M' in model_name:
+        model_size = 0.5
+    elif '360M' in model_name or '350M' in model_name:
+        model_size = 0.35
+    elif '270M' in model_name:
+        model_size = 0.27
+    elif '135M' in model_name:
+        model_size = 0.135
+    else:
+        model_size = 4  # default
+
+    # Only use AOT for models >= 2B parameters
+    use_aot = model_size >= 2
+
+    base_duration = 120 if use_aot else 60  # Shorter base for non-AOT
+    token_duration = max_tokens * 0.1
     search_duration = 30 if enable_search else 0
+    aot_compilation_buffer = 60 if use_aot else 0  # Extra time for compilation
+
+    return base_duration + token_duration + search_duration + aot_compilation_buffer

 @spaces.GPU(duration=get_duration)
 def chat_response(user_msg, chat_history, system_prompt,
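The new get_duration budgets compilation time only for models of 2B parameters or more, guessing the size from the model name. The branch order in the if/elif chain is significant: '32B' contains the substring '2B' and '14B' contains '4B', so the larger markers must be tested first. The same chain is repeated verbatim inside chat_response in the next hunk; a minimal sketch of how it could be factored into one shared helper (estimate_model_size and _SIZE_MARKERS are hypothetical names, not part of this commit):

# Hypothetical consolidation of the if/elif chain duplicated in
# get_duration and chat_response. Order is preserved because it is
# significant: '32B' contains '2B', and '14B' contains '4B'.
_SIZE_MARKERS = [
    (('30B', '32B'), 30), (('20B',), 20), (('15B', '14B'), 15),
    (('4B', '3B'), 4), (('2B', '1.7B'), 2),
    (('1.5B', '1.2B', '1.1B'), 1.5), (('1B',), 1),
    (('700M', '600M'), 0.7), (('500M',), 0.5),
    (('360M', '350M'), 0.35), (('270M',), 0.27), (('135M',), 0.135),
]

def estimate_model_size(model_name):
    """Rough parameter count in billions, guessed from the model name."""
    for markers, size in _SIZE_MARKERS:
        if any(marker in model_name for marker in markers):
            return size
    return 4  # same default as the commit

With the constants above, a 30B model with search enabled and max_tokens=1024 would be budgeted 120 + 102.4 + 30 + 60 = 312.4 seconds of GPU time, while a 135M model with the same settings gets only 60 + 102.4 + 30 = 192.4 seconds.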
@@ -419,34 +453,68 @@ def chat_response(user_msg, chat_history, system_prompt,

     pipe = load_pipeline(model_name)

-    # AOT compilation for performance optimization
+    # AOT compilation for performance optimization (only for larger models)
+    # Estimate model size
+    model_size = 0
+    if '30B' in model_name or '32B' in model_name:
+        model_size = 30
+    elif '20B' in model_name:
+        model_size = 20
+    elif '15B' in model_name or '14B' in model_name:
+        model_size = 15
+    elif '4B' in model_name or '3B' in model_name:
+        model_size = 4
+    elif '2B' in model_name or '1.7B' in model_name:
+        model_size = 2
+    elif '1.5B' in model_name or '1.2B' in model_name or '1.1B' in model_name:
+        model_size = 1.5
+    elif '1B' in model_name:
+        model_size = 1
+    elif '700M' in model_name or '600M' in model_name:
+        model_size = 0.7
+    elif '500M' in model_name:
+        model_size = 0.5
+    elif '360M' in model_name or '350M' in model_name:
+        model_size = 0.35
+    elif '270M' in model_name:
+        model_size = 0.27
+    elif '135M' in model_name:
+        model_size = 0.135
+    else:
+        model_size = 4  # default
+
+    use_aot = model_size >= 2  # Only compile models >= 2B parameters
+
+    if use_aot:
+        try:
+            with spaces.aoti_capture(pipe.model) as call:
+                pipe("Hello world", max_new_tokens=5, do_sample=False, pad_token_id=pipe.tokenizer.eos_token_id)
+
+            # Define dynamic shapes for variable sequence lengths
+            seq_dim = torch.export.Dim('seq', min=1, max=4096)
+            dynamic_shapes = tree_map(lambda v: None, call.kwargs)
+
+            # Set dynamic dimensions for common inputs
+            if 'input_ids' in call.kwargs:
+                dynamic_shapes['input_ids'] = {1: seq_dim}
+            if 'attention_mask' in call.kwargs:
+                dynamic_shapes['attention_mask'] = {1: seq_dim}
+            if 'position_ids' in call.kwargs:
+                dynamic_shapes['position_ids'] = {1: seq_dim}
+
+            exported = torch.export.export(
+                pipe.model,
+                args=call.args,
+                kwargs=call.kwargs,
+                dynamic_shapes=dynamic_shapes
+            )
+            compiled = spaces.aoti_compile(exported)
+            spaces.aoti_apply(compiled, pipe.model)
+            print(f"AOT compilation successful for {model_name}")
+        except Exception as e:
+            print(f"AOT compilation failed for {model_name}: {e}")
+    else:
+        print(f"Skipping AOT compilation for small model {model_name}")

     prompt = format_conversation(history, enriched, pipe.tokenizer)
     prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
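The compiled path leans on names the hunk does not show being imported: the spaces package (whose aoti_capture, aoti_compile, and aoti_apply helpers it calls), torch.export, and tree_map. Below is a minimal, self-contained sketch of the same capture, export, compile, and apply sequence as a reusable function; compile_model_aot is a hypothetical name, and sourcing tree_map from torch.utils._pytree is an assumption about where the app gets it:

# Minimal sketch of the capture -> export -> compile -> apply sequence
# from the hunk above, with the imports the diff does not show.
# compile_model_aot is a hypothetical helper, not part of the commit.
import spaces
import torch
from torch.utils._pytree import tree_map  # assumed source of tree_map

def compile_model_aot(pipe, model_name, max_seq=4096):
    try:
        # Record the args/kwargs of one real forward call on the model.
        with spaces.aoti_capture(pipe.model) as call:
            pipe("Hello world", max_new_tokens=5, do_sample=False,
                 pad_token_id=pipe.tokenizer.eos_token_id)

        # Mark dim 1 (sequence length) as dynamic so a single compiled
        # artifact serves prompts of any length up to max_seq.
        seq_dim = torch.export.Dim('seq', min=1, max=max_seq)
        dynamic_shapes = tree_map(lambda v: None, call.kwargs)
        for key in ('input_ids', 'attention_mask', 'position_ids'):
            if key in call.kwargs:
                dynamic_shapes[key] = {1: seq_dim}

        exported = torch.export.export(pipe.model, args=call.args,
                                       kwargs=call.kwargs,
                                       dynamic_shapes=dynamic_shapes)
        # Swap the eager forward for the compiled one in place.
        spaces.aoti_apply(spaces.aoti_compile(exported), pipe.model)
        return True
    except Exception as e:
        # Best-effort: fall back to the uncompiled model on any failure.
        print(f"AOT compilation failed for {model_name}: {e}")
        return False

Wrapping the sequence this way would also make it straightforward to memoize per model_name, so the 60-second compilation buffer from get_duration is only spent on a model's first request.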