Restore cancel generation feature with improved UI integration
app.py
CHANGED
@@ -13,6 +13,9 @@ from ddgs import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
 from torch.utils._pytree import tree_map
 
+# Global event to signal cancellation from the UI thread to the generation thread
+cancel_event = threading.Event()
+
 access_token=os.environ['HF_TOKEN']
 
 # Optional: Disable GPU visibility if you wish to force CPU usage
@@ -402,7 +405,11 @@ def chat_response(user_msg, chat_history, system_prompt,
                   top_k, top_p, repeat_penalty, search_timeout):
     """
     Generates streaming chat responses, optionally with background web search.
+    This version includes cancellation support.
     """
+    # Clear the cancellation event at the start of a new generation
+    cancel_event.clear()
+
     history = list(chat_history or [])
     history.append({'role': 'user', 'content': user_msg})
 
@@ -505,6 +512,12 @@ def chat_response(user_msg, chat_history, system_prompt,
 
         # Stream tokens
         for chunk in streamer:
+            # Check for cancellation signal
+            if cancel_event.is_set():
+                history[-1]['content'] += " [Generation Canceled]"
+                yield history, debug
+                break
+
             text = chunk
 
             # Detect start of thinking
@@ -560,6 +573,7 @@ def chat_response(user_msg, chat_history, system_prompt,
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
         yield history, debug
     finally:
+        # Final cleanup
         gc.collect()
 
 
@@ -616,34 +630,70 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
             st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
             clr = gr.Button("Clear Chat")
         with gr.Column(scale=7):
-            chat = gr.Chatbot(type="messages")
-            txt = gr.Textbox(placeholder="Type your message...")
+            chat = gr.Chatbot(type="messages", height=600)
+            with gr.Row():
+                txt = gr.Textbox(placeholder="Type your message...", scale=8, container=False)
+                submit_btn = gr.Button("Submit", variant="primary", scale=1)
+                cancel_btn = gr.Button("⏹️ Cancel", variant="stop", visible=False, scale=1)
             dbg = gr.Markdown()
 
+    # --- Event Listeners ---
+
+    # Group all inputs for cleaner event handling
+    chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st]
+
+    def start_generation_and_update_ui(*args):
+        # Update UI to "generating" state
+        yield {
+            submit_btn: gr.update(interactive=False),
+            cancel_btn: gr.update(visible=True),
+            txt: gr.update(interactive=False, value=""),  # Clear textbox and disable
+        }
+        # Call the actual chat response generator
+        for output in chat_response(*args):
+            yield {
+                chat: output[0],
+                dbg: output[1]
+            }
+
+    def reset_ui_after_generation():
+        # Update UI back to "idle" state
+        return {
+            submit_btn: gr.update(interactive=True),
+            cancel_btn: gr.update(visible=False),
+            txt: gr.update(interactive=True),  # Re-enable textbox
+        }
+
+    def set_cancel_flag():
+        cancel_event.set()
+        print("Cancellation signal sent.")
+
+    # When the user submits their message (via button or Enter)
+    submit_event = txt.submit(
+        fn=start_generation_and_update_ui,
+        inputs=chat_inputs,
+        outputs=[chat, dbg, submit_btn, cancel_btn, txt]
+    ).then(fn=reset_ui_after_generation, outputs=[submit_btn, cancel_btn, txt])
+
+    submit_btn.click(
+        fn=start_generation_and_update_ui,
+        inputs=chat_inputs,
+        outputs=[chat, dbg, submit_btn, cancel_btn, txt]
+    ).then(fn=reset_ui_after_generation, outputs=[submit_btn, cancel_btn, txt])
+
+    # When the user clicks the cancel button
+    cancel_btn.click(
+        fn=set_cancel_flag,
+        cancels=[submit_event]  # This tells Gradio to stop the running `submit_event`
+    )
+
     # Update duration estimate when relevant inputs change
-    model_dd.change(fn=update_duration_estimate,
-                    inputs=[model_dd, search_chk, mr, mc, max_tok, st],
-                    outputs=duration_display)
-    search_chk.change(fn=update_duration_estimate,
-                      inputs=[model_dd, search_chk, mr, mc, max_tok, st],
-                      outputs=duration_display)
-    max_tok.change(fn=update_duration_estimate,
-                   inputs=[model_dd, search_chk, mr, mc, max_tok, st],
-                   outputs=duration_display)
-    mr.change(fn=update_duration_estimate,
-              inputs=[model_dd, search_chk, mr, mc, max_tok, st],
-              outputs=duration_display)
-    mc.change(fn=update_duration_estimate,
-              inputs=[model_dd, search_chk, mr, mc, max_tok, st],
-              outputs=duration_display)
-    st.change(fn=update_duration_estimate,
-              inputs=[model_dd, search_chk, mr, mc, max_tok, st],
-              outputs=duration_display)
+    duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st]
+    for component in duration_inputs:
+        component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
 
+    # Other event listeners
     search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
     clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
-    txt.submit(fn=chat_response,
-               inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
-                       model_dd, max_tok, temp, k, p, rp, st],
-               outputs=[chat, dbg])
-demo.launch()
+
+demo.launch()
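
The core of the change is cooperative cancellation: a module-level threading.Event is set by the UI thread and polled between tokens by the generation loop. Below is a minimal, self-contained sketch of that pattern; the fake_streamer and generate names are illustrative stand-ins, not code from app.py.

import threading
import time

# Shared flag: the UI thread sets it, the generation loop polls it.
cancel_event = threading.Event()

def fake_streamer():
    # Stand-in for a model token streamer: yields tokens slowly.
    for token in ["Once", " upon", " a", " time", "..."]:
        time.sleep(0.5)
        yield token

def generate():
    cancel_event.clear()  # Reset any stale signal from a previous run
    output = ""
    for chunk in fake_streamer():
        if cancel_event.is_set():  # Polled between tokens: cooperative, not forced
            yield output + " [Generation Canceled]"
            return
        output += chunk
        yield output

# Simulate a user pressing Cancel 0.8 s into the stream.
threading.Timer(0.8, cancel_event.set).start()
for partial in generate():
    print(partial)

Because the flag is only checked between chunks, cancellation takes effect at the next token boundary rather than instantly, which is why the diff pairs it with Gradio's cancels= mechanism.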
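The UI wiring layers Gradio's cancels= feature on top of that flag: the handle returned by one event listener can be passed to another listener's cancels= argument to stop the queued job. A minimal standalone sketch of just that mechanism, under the assumption of a current Gradio version; slow_count and the component names here are illustrative, not from app.py.

import time
import gradio as gr

def slow_count():
    # A long-running generator whose stream Gradio can cut off mid-flight.
    for i in range(100):
        time.sleep(0.1)
        yield str(i)

with gr.Blocks() as demo:
    out = gr.Textbox(label="Output")
    start = gr.Button("Start")
    stop = gr.Button("Stop", variant="stop")

    # Keep a handle to the event so another listener can cancel it.
    run_event = start.click(fn=slow_count, outputs=out)

    # fn=None makes this a pure cancellation listener.
    stop.click(fn=None, cancels=[run_event])

demo.launch()

Note that in the diff only the txt.submit chain is passed to cancels=; a run started via submit_btn.click is stopped solely by the threading.Event flag, which still ends the stream cleanly at the next token boundary.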