Luigi committed
Commit efa3ae6 · 1 Parent(s): afa1066

Remove cancel generation feature as it didn't work

Files changed (1):
  1. app.py +0 -29
app.py CHANGED
@@ -18,18 +18,6 @@ access_token=os.environ['HF_TOKEN']
   # Optional: Disable GPU visibility if you wish to force CPU usage
   # os.environ["CUDA_VISIBLE_DEVICES"] = ""

- # ------------------------------
- # Global Cancellation Event
- # ------------------------------
- cancel_event = threading.Event()
-
- # ------------------------------
- # Stopping Criteria for Cancellation
- # ------------------------------
- class CancelStoppingCriteria(StoppingCriteria):
-     def __call__(self, input_ids, scores, **kwargs):
-         return cancel_event.is_set()
-
   # ------------------------------
   # Torch-Compatible Model Definitions with Adjusted Descriptions
   # ------------------------------
@@ -415,7 +403,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   """
   Generates streaming chat responses, optionally with background web search.
   """
- cancel_event.clear()
   history = list(chat_history or [])
   history.append({'role': 'user', 'content': user_msg})

@@ -506,7 +493,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   'top_p': top_p,
   'repetition_penalty': repeat_penalty,
   'streamer': streamer,
- 'stopping_criteria': [CancelStoppingCriteria()],
   'return_full_text': False,
   }
   )
@@ -519,8 +505,6 @@ def chat_response(user_msg, chat_history, system_prompt,

   # Stream tokens
   for chunk in streamer:
- if cancel_event.is_set():
-     break
   text = chunk

   # Detect start of thinking
@@ -545,8 +529,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   history.append({'role': 'assistant', 'content': answer_buf})
   else:
   history[-1]['content'] = thought_buf
- if cancel_event.is_set():
-     break
   yield history, debug
   continue

@@ -562,8 +544,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   history.append({'role': 'assistant', 'content': answer_buf})
   else:
   history[-1]['content'] = thought_buf
- if cancel_event.is_set():
-     break
   yield history, debug
   continue

@@ -572,8 +552,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   history.append({'role': 'assistant', 'content': ''})
   answer_buf += text
   history[-1]['content'] = answer_buf
- if cancel_event.is_set():
-     break
   yield history, debug

   gen_thread.join()
@@ -585,11 +563,6 @@ def chat_response(user_msg, chat_history, system_prompt,
   gc.collect()


- def cancel_generation():
-     cancel_event.set()
-     return 'Generation cancelled.'
-
-
   def update_default_prompt(enable_search):
   return f"You are a helpful assistant."

@@ -642,7 +615,6 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
   mc = gr.Number(value=50, precision=0, label="Max Chars/Result")
   st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
   clr = gr.Button("Clear Chat")
- cnl = gr.Button("Cancel Generation")
   with gr.Column(scale=7):
   chat = gr.Chatbot(type="messages")
   txt = gr.Textbox(placeholder="Type your message and press Enter...")
@@ -670,7 +642,6 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:

   search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
   clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
- cnl.click(fn=cancel_generation, outputs=dbg)
   txt.submit(fn=chat_response,
   inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
   model_dd, max_tok, temp, k, p, rp, st],
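
For reference, the cancellation path removed here combined a module-level threading.Event with a custom StoppingCriteria polled by generate(), plus an extra check in the loop that drains the token streamer. Below is a minimal standalone sketch of that pattern, assuming a small placeholder model ("gpt2") and a bare prompt; stream_reply and the generation settings are illustrative, not values taken from app.py.

# Sketch of the removed cancellation pattern (hypothetical standalone example, not the app's exact code):
# a shared Event is set by a cancel handler, checked by a custom StoppingCriteria inside
# generate(), and checked again by the loop relaying tokens from the streamer.
import threading
from threading import Thread

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

cancel_event = threading.Event()

class CancelStoppingCriteria(StoppingCriteria):
    def __call__(self, input_ids, scores, **kwargs):
        # generate() stops as soon as any criterion returns True
        return cancel_event.is_set()

def stream_reply(prompt, model_name="gpt2"):  # model name is a placeholder
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt")

    cancel_event.clear()
    thread = Thread(
        target=model.generate,
        kwargs={
            **inputs,
            "max_new_tokens": 256,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
            "stopping_criteria": StoppingCriteriaList([CancelStoppingCriteria()]),
        },
    )
    thread.start()
    for chunk in streamer:
        if cancel_event.is_set():
            break  # stop relaying tokens once cancellation is requested
        yield chunk
    thread.join()

def cancel_generation():
    cancel_event.set()
    return "Generation cancelled."

In the Space, cancel_generation() was exposed through the removed "Cancel Generation" button via cnl.click(fn=cancel_generation, outputs=dbg); this commit drops that wiring along with the event and criteria above.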
 