Spaces:
Running
Running
user can define search timeout
Browse files
app.py
CHANGED
|
@@ -116,7 +116,7 @@ def format_conversation(history, system_prompt, tokenizer):
|
|
| 116 |
def chat_response(user_msg, chat_history, system_prompt,
|
| 117 |
enable_search, max_results, max_chars,
|
| 118 |
model_name, max_tokens, temperature,
|
| 119 |
-
top_k, top_p, repeat_penalty):
|
| 120 |
"""
|
| 121 |
Generates streaming chat responses, optionally with background web search.
|
| 122 |
"""
|
|
@@ -149,7 +149,7 @@ def chat_response(user_msg, chat_history, system_prompt,
|
|
| 149 |
|
| 150 |
# wait up to 1s for snippets, then replace debug with them
|
| 151 |
if enable_search:
|
| 152 |
-
thread_search.join(timeout=1.0)
|
| 153 |
if search_results:
|
| 154 |
debug = "### Search results merged into prompt\n\n" + "\n".join(
|
| 155 |
f"- {r}" for r in search_results
|
|
@@ -280,6 +280,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
|
| 280 |
gr.Markdown("### Web Search Settings")
|
| 281 |
mr = gr.Number(value=6, precision=0, label="Max Results")
|
| 282 |
mc = gr.Number(value=600, precision=0, label="Max Chars/Result")
|
|
|
|
| 283 |
clr = gr.Button("Clear Chat")
|
| 284 |
cnl = gr.Button("Cancel Generation")
|
| 285 |
with gr.Column(scale=7):
|
|
@@ -292,6 +293,6 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
|
|
| 292 |
cnl.click(fn=cancel_generation, outputs=dbg)
|
| 293 |
txt.submit(fn=chat_response,
|
| 294 |
inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
|
| 295 |
-
model_dd, max_tok, temp, k, p, rp],
|
| 296 |
outputs=[chat, dbg])
|
| 297 |
demo.launch()
|
|
|
|
| 116 |
def chat_response(user_msg, chat_history, system_prompt,
|
| 117 |
enable_search, max_results, max_chars,
|
| 118 |
model_name, max_tokens, temperature,
|
| 119 |
+
top_k, top_p, repeat_penalty, search_timeout):
|
| 120 |
"""
|
| 121 |
Generates streaming chat responses, optionally with background web search.
|
| 122 |
"""
|
|
|
|
| 149 |
|
| 150 |
# wait up to search_timeout seconds for snippets, then replace debug with them
|
| 151 |
if enable_search:
|
| 152 |
+
thread_search.join(timeout=float(search_timeout))
|
| 153 |
if search_results:
|
| 154 |
debug = "### Search results merged into prompt\n\n" + "\n".join(
|
| 155 |
f"- {r}" for r in search_results
|
|
|
|
| 280 |
gr.Markdown("### Web Search Settings")
|
| 281 |
mr = gr.Number(value=6, precision=0, label="Max Results")
|
| 282 |
mc = gr.Number(value=600, precision=0, label="Max Chars/Result")
|
| 283 |
+
st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="Search Timeout (s)")
|
| 284 |
clr = gr.Button("Clear Chat")
|
| 285 |
cnl = gr.Button("Cancel Generation")
|
| 286 |
with gr.Column(scale=7):
|
|
|
|
| 293 |
cnl.click(fn=cancel_generation, outputs=dbg)
|
| 294 |
txt.submit(fn=chat_response,
|
| 295 |
inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
|
| 296 |
+
model_dd, max_tok, temp, k, p, rp, st],
|
| 297 |
outputs=[chat, dbg])
|
| 298 |
demo.launch()
|