Spaces:
Running
Running
Add dynamic GPU time estimate indicator to UI
Browse files

- Shows estimated GPU seconds for current inference settings
- Updates in real-time when model, search settings, or max tokens change
- Displays model size, AOT status, and search status
- Helps users manage precious ZeroGPU time budget effectively
app.py
CHANGED
|
@@ -614,6 +614,28 @@ def cancel_generation():
|
|
| 614 |
def update_default_prompt(enable_search):
    """Return the default system prompt for the chat UI.

    Parameters
    ----------
    enable_search : bool
        State of the "Enable Web Search" checkbox.  The prompt text does
        not currently depend on it; the parameter exists so the Gradio
        ``change`` event can pass the checkbox value directly.

    Returns
    -------
    str
        The default system prompt.
    """
    # Plain literal: the original used an f-string with no placeholders
    # (ruff F541), which is misleading — nothing is interpolated.
    return "You are a helpful assistant."
|
| 616 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
# ------------------------------
|
| 618 |
# Gradio UI
|
| 619 |
# ------------------------------
|
|
@@ -625,6 +647,12 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
|
| 625 |
model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value="Qwen3-1.7B")
|
| 626 |
search_chk = gr.Checkbox(label="Enable Web Search", value=False)
|
| 627 |
sys_prompt = gr.Textbox(label="System Prompt", lines=3, value=update_default_prompt(search_chk.value))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
gr.Markdown("### Generation Parameters")
|
| 629 |
max_tok = gr.Slider(64, 16384, value=1024, step=32, label="Max Tokens")
|
| 630 |
temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
|
|
@@ -642,6 +670,26 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
|
| 642 |
txt = gr.Textbox(placeholder="Type your message and press Enter...")
|
| 643 |
dbg = gr.Markdown()
|
| 644 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
|
| 646 |
clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
|
| 647 |
cnl.click(fn=cancel_generation, outputs=dbg)
|
|
|
|
| 614 |
def update_default_prompt(enable_search):
    """Produce the system prompt shown by default in the UI.

    ``enable_search`` mirrors the web-search checkbox so this function
    can be wired straight to its ``change`` event; the returned text is
    currently the same either way.
    """
    default_prompt = "You are a helpful assistant."
    return default_prompt
|
| 616 |
|
| 617 |
+
def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout):
    """Calculate and format the estimated GPU duration for current settings.

    Returns a Markdown string summarizing the estimated GPU seconds,
    model size, AOT-compilation status, and web-search status for the
    currently selected inference settings.  On any failure the error is
    surfaced in the returned text instead of raising.
    """
    try:
        # get_duration expects the full generation signature; the message,
        # history and system prompt do not influence the estimate, so pass
        # empty placeholders.  Sampling params (temperature, top-k, top-p,
        # repetition penalty) are fixed at the UI defaults.
        estimate = get_duration(
            "", [], "",
            enable_search, max_results, max_chars, model_name,
            max_tokens, 0.7, 40, 0.9, 1.2, search_timeout,
        )

        params_b = MODELS[model_name].get("params_b", 4.0)
        # Display-only flag: mirrors the >= 2B-parameter AOT threshold.
        aot_enabled = params_b >= 2

        parts = (
            f"⏱️ **Estimated GPU Time: {estimate:.1f} seconds**\n\n",
            f"📊 **Model Size:** {params_b:.1f}B parameters\n",
            f"⚡ **AOT Compilation:** {'Enabled' if aot_enabled else 'Disabled'}\n",
            f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}",
        )
        return "".join(parts)
    except Exception as e:
        # Best-effort UI indicator: report the problem rather than crash.
        return f"⚠️ Error calculating estimate: {e}"
|
| 638 |
+
|
| 639 |
# ------------------------------
|
| 640 |
# Gradio UI
|
| 641 |
# ------------------------------
|
|
|
|
| 647 |
model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value="Qwen3-1.7B")
|
| 648 |
search_chk = gr.Checkbox(label="Enable Web Search", value=False)
|
| 649 |
sys_prompt = gr.Textbox(label="System Prompt", lines=3, value=update_default_prompt(search_chk.value))
|
| 650 |
+
|
| 651 |
+
# GPU Time Estimate Display
|
| 652 |
+
duration_display = gr.Markdown(value=update_duration_estimate(
|
| 653 |
+
"Qwen3-1.7B", False, 4, 50, 1024, 5.0
|
| 654 |
+
))
|
| 655 |
+
|
| 656 |
gr.Markdown("### Generation Parameters")
|
| 657 |
max_tok = gr.Slider(64, 16384, value=1024, step=32, label="Max Tokens")
|
| 658 |
temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
|
|
|
|
| 670 |
txt = gr.Textbox(placeholder="Type your message and press Enter...")
|
| 671 |
dbg = gr.Markdown()
|
| 672 |
|
| 673 |
+
# Update duration estimate when relevant inputs change.
# All six controls share exactly the same inputs/outputs wiring, so
# register them in a loop instead of six copy-pasted .change() calls —
# adding a new control later only means appending it to this list.
_estimate_inputs = [model_dd, search_chk, mr, mc, max_tok, st]
for _control in _estimate_inputs:
    _control.change(fn=update_duration_estimate,
                    inputs=_estimate_inputs,
                    outputs=duration_display)
|
| 692 |
+
|
| 693 |
search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
|
| 694 |
clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
|
| 695 |
cnl.click(fn=cancel_generation, outputs=dbg)
|