Adjust duration estimation for H200 performance - reduce conservative estimates
app.py CHANGED

@@ -362,10 +362,11 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
 
-
-
-
-
+    # Adjusted for H200 performance: faster inference, quicker compilation
+    base_duration = 20 if not use_aot else 40  # Reduced base times
+    token_duration = max_tokens * 0.005  # ~200 tokens/second average on H200
+    search_duration = 10 if enable_search else 0  # Reduced search time
+    aot_compilation_buffer = 20 if use_aot else 0  # Faster compilation on H200
 
     return base_duration + token_duration + search_duration + aot_compilation_buffer
 
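For reference, the estimator this hunk modifies would read roughly as below after the change. This is a minimal sketch, not the exact file: only lines 362-371 of app.py appear in the diff, the signature is truncated after "max_resul", and the max_results, max_tokens, and model_size parameters shown here are assumptions inferred from how the hunk uses them.

# Minimal sketch of get_duration after this commit. The parameter list past
# enable_search (max_results, max_tokens, model_size) is assumed; only the
# body lines shown in the diff hunk are taken from app.py.
def get_duration(user_msg, chat_history, system_prompt, enable_search,
                 max_results, max_tokens, model_size):
    # Only use AOT for models >= 2B parameters
    use_aot = model_size >= 2

    # Adjusted for H200 performance: faster inference, quicker compilation
    base_duration = 20 if not use_aot else 40      # reduced base times
    token_duration = max_tokens * 0.005            # ~200 tokens/second average on H200
    search_duration = 10 if enable_search else 0   # reduced search time
    aot_compilation_buffer = 20 if use_aot else 0  # faster compilation on H200

    return base_duration + token_duration + search_duration + aot_compilation_buffer

With the new constants, a 2B+ model generating 2048 tokens with search enabled is budgeted 40 + 2048 * 0.005 + 10 + 20, about 80 seconds, while a small model without search or AOT comes to roughly 30 seconds (20 + 10.24).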