Luigi committed
Commit efa3ae6 · 1 Parent(s): afa1066

Remove cancel generation feature as it didn't work

Files changed (1):
  1. app.py +0 -29
app.py CHANGED
@@ -18,18 +18,6 @@ access_token=os.environ['HF_TOKEN']
   # Optional: Disable GPU visibility if you wish to force CPU usage
   # os.environ["CUDA_VISIBLE_DEVICES"] = ""

- # ------------------------------
- # Global Cancellation Event
- # ------------------------------
- cancel_event = threading.Event()
-
- # ------------------------------
- # Stopping Criteria for Cancellation
- # ------------------------------
- class CancelStoppingCriteria(StoppingCriteria):
-     def __call__(self, input_ids, scores, **kwargs):
-         return cancel_event.is_set()
-
   # ------------------------------
   # Torch-Compatible Model Definitions with Adjusted Descriptions
   # ------------------------------
@@ -415,7 +403,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   """
   Generates streaming chat responses, optionally with background web search.
   """
- cancel_event.clear()
   history = list(chat_history or [])
   history.append({'role': 'user', 'content': user_msg})

@@ -506,7 +493,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   'top_p': top_p,
   'repetition_penalty': repeat_penalty,
   'streamer': streamer,
- 'stopping_criteria': [CancelStoppingCriteria()],
   'return_full_text': False,
   }
   )
@@ -519,8 +505,6 @@ def chat_response(user_msg, chat_history, system_prompt,

   # Stream tokens
   for chunk in streamer:
- if cancel_event.is_set():
-     break
   text = chunk

   # Detect start of thinking
@@ -545,8 +529,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   history.append({'role': 'assistant', 'content': answer_buf})
   else:
   history[-1]['content'] = thought_buf
- if cancel_event.is_set():
-     break
   yield history, debug
   continue

@@ -562,8 +544,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   history.append({'role': 'assistant', 'content': answer_buf})
   else:
   history[-1]['content'] = thought_buf
- if cancel_event.is_set():
-     break
   yield history, debug
   continue

@@ -572,8 +552,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   history.append({'role': 'assistant', 'content': ''})
   answer_buf += text
   history[-1]['content'] = answer_buf
- if cancel_event.is_set():
-     break
   yield history, debug

   gen_thread.join()
@@ -585,11 +563,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   gc.collect()


- def cancel_generation():
-     cancel_event.set()
-     return 'Generation cancelled.'
-
-
   def update_default_prompt(enable_search):
   return f"You are a helpful assistant."

@@ -642,7 +615,6 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
   mc = gr.Number(value=50, precision=0, label="Max Chars/Result")
   st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
   clr = gr.Button("Clear Chat")
- cnl = gr.Button("Cancel Generation")
   with gr.Column(scale=7):
   chat = gr.Chatbot(type="messages")
   txt = gr.Textbox(placeholder="Type your message and press Enter...")
@@ -670,7 +642,6 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:

   search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
   clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
- cnl.click(fn=cancel_generation, outputs=dbg)
   txt.submit(fn=chat_response,
   inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
   model_dd, max_tok, temp, k, p, rp, st],
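
For reference, the cancellation path removed here combined a module-level threading.Event with a custom StoppingCriteria polled by generate(), plus an extra check in the loop that drains the token streamer. Below is a minimal standalone sketch of that pattern, assuming a small placeholder model ("gpt2") and a bare prompt; stream_reply and the generation settings are illustrative, not values taken from app.py.

# Sketch of the removed cancellation pattern (hypothetical standalone example, not the app's exact code):
# a shared Event is set by a cancel handler, checked by a custom StoppingCriteria inside
# generate(), and checked again by the loop relaying tokens from the streamer.
import threading
from threading import Thread

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

cancel_event = threading.Event()

class CancelStoppingCriteria(StoppingCriteria):
    def __call__(self, input_ids, scores, **kwargs):
        # generate() stops as soon as any criterion returns True
        return cancel_event.is_set()

def stream_reply(prompt, model_name="gpt2"):  # model name is a placeholder
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt")

    cancel_event.clear()
    thread = Thread(
        target=model.generate,
        kwargs={
            **inputs,
            "max_new_tokens": 256,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
            "stopping_criteria": StoppingCriteriaList([CancelStoppingCriteria()]),
        },
    )
    thread.start()
    for chunk in streamer:
        if cancel_event.is_set():
            break  # stop relaying tokens once cancellation is requested
        yield chunk
    thread.join()

def cancel_generation():
    cancel_event.set()
    return "Generation cancelled."

In the Space, cancel_generation() was exposed through the removed "Cancel Generation" button via cnl.click(fn=cancel_generation, outputs=dbg); this commit drops that wiring along with the event and criteria above.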
 