Luigi committed on
Commit
de766da
·
1 Parent(s): e3e334f

Adjust duration estimation for H200 performance - reduce conservative estimates

Browse files
Files changed (1) hide show
  1. app.py +5 -4
app.py CHANGED
@@ -362,10 +362,11 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
362
  # Only use AOT for models >= 2B parameters
363
  use_aot = model_size >= 2
364
 
365
- base_duration = 60 if not use_aot else 120 # Shorter base for non-AOT
366
- token_duration = max_tokens * 0.1
367
- search_duration = 30 if enable_search else 0
368
- aot_compilation_buffer = 60 if use_aot else 0 # Extra time for compilation
 
369
 
370
  return base_duration + token_duration + search_duration + aot_compilation_buffer
371
 
 
362
  # Only use AOT for models >= 2B parameters
363
  use_aot = model_size >= 2
364
 
365
+ # Adjusted for H200 performance: faster inference, quicker compilation
366
+ base_duration = 20 if not use_aot else 40 # Reduced base times
367
+ token_duration = max_tokens * 0.005 # ~200 tokens/second average on H200
368
+ search_duration = 10 if enable_search else 0 # Reduced search time
369
+ aot_compilation_buffer = 20 if use_aot else 0 # Faster compilation on H200
370
 
371
  return base_duration + token_duration + search_duration + aot_compilation_buffer
372