Adjust duration estimation for H200 performance - reduce conservative estimates
app.py CHANGED

@@ -362,10 +362,11 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
 
-
-
-
-
+    # Adjusted for H200 performance: faster inference, quicker compilation
+    base_duration = 20 if not use_aot else 40  # Reduced base times
+    token_duration = max_tokens * 0.005  # ~200 tokens/second average on H200
+    search_duration = 10 if enable_search else 0  # Reduced search time
+    aot_compilation_buffer = 20 if use_aot else 0  # Faster compilation on H200
 
     return base_duration + token_duration + search_duration + aot_compilation_buffer
 
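For reference, the estimator this hunk modifies would read roughly as below after the change. This is a minimal sketch, not the exact file: only lines 362-371 of app.py appear in the diff, the signature is truncated after "max_resul", and the max_results, max_tokens, and model_size parameters shown here are assumptions inferred from how the hunk uses them.

# Minimal sketch of get_duration after this commit. The parameter list past
# enable_search (max_results, max_tokens, model_size) is assumed; only the
# body lines shown in the diff hunk are taken from app.py.
def get_duration(user_msg, chat_history, system_prompt, enable_search,
                 max_results, max_tokens, model_size):
    # Only use AOT for models >= 2B parameters
    use_aot = model_size >= 2

    # Adjusted for H200 performance: faster inference, quicker compilation
    base_duration = 20 if not use_aot else 40      # reduced base times
    token_duration = max_tokens * 0.005            # ~200 tokens/second average on H200
    search_duration = 10 if enable_search else 0   # reduced search time
    aot_compilation_buffer = 20 if use_aot else 0  # faster compilation on H200

    return base_duration + token_duration + search_duration + aot_compilation_buffer

With the new constants, a 2B+ model generating 2048 tokens with search enabled is budgeted 40 + 2048 * 0.005 + 10 + 20, about 80 seconds, while a small model without search or AOT comes to roughly 30 seconds (20 + 10.24).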