make streaming response
app.py CHANGED

@@ -6,7 +6,7 @@ from itertools import islice
 from datetime import datetime
 import gradio as gr
 import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from duckduckgo_search import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
 
@@ -134,19 +134,19 @@ def format_conversation(conversation, system_prompt):
     return prompt
 
 # ------------------------------
-# Chat Response Generation with ZeroGPU using Pipeline
+# Chat Response Generation with ZeroGPU using Pipeline (Streaming Token-by-Token)
 # ------------------------------
 @spaces.GPU(duration=60)
 def chat_response(user_message, chat_history, system_prompt, enable_search,
                   max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty):
     """
-    Generate a chat response by utilizing a transformers pipeline.
+    Generate a chat response by utilizing a transformers pipeline with streaming.
 
     - Appends the user's message to the conversation history.
     - Optionally retrieves web search context and inserts it as an additional system message.
     - Converts the conversation into a formatted prompt to avoid leaking role labels.
-    - Uses the cached pipeline
-    -
+    - Uses the cached pipeline’s underlying model and tokenizer with a streamer to yield tokens as they are generated.
+    - Yields updated conversation history token by token.
     """
     cancel_event.clear()
 
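The helper referenced in the docstring, format_conversation, is defined elsewhere in app.py and is not part of this diff. The sketch below is a hypothetical reconstruction of its shape, shown only to make the surrounding changes easier to follow; the exact role labels and layout are assumptions, not the Space's real code.

# Hypothetical sketch of what format_conversation likely does; the real helper
# lives outside this diff and its exact labels/layout are assumptions.
def format_conversation_sketch(conversation, system_prompt):
    lines = [system_prompt.strip(), ""]
    for msg in conversation:
        if msg["role"] == "user":
            lines.append(f"User: {msg['content']}")
        elif msg["role"] == "assistant" and msg["content"]:
            lines.append(f"Assistant: {msg['content']}")
    # End with the assistant label so the model continues from here.
    lines.append("Assistant:")
    return "\n".join(lines)

Because the prompt is a single flat string, the removed code below had to slice len(prompt_text) characters off the generated text to isolate the reply, whereas the streaming version gets the same effect from skip_prompt=True.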
@@ -179,32 +179,43 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
     conversation.append({"role": "assistant", "content": ""})
 
     try:
-        # Format the entire conversation into a single prompt
+        # Format the entire conversation into a single prompt.
         prompt_text = format_conversation(conversation, system_prompt)
 
-        # Load the pipeline
+        # Load the pipeline.
         pipe = load_pipeline(model_name)
+        # Obtain the underlying tokenizer and model.
+        tokenizer = pipe.tokenizer
+        model = pipe.model
 
-        #
-
-            prompt_text,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            repetition_penalty=repeat_penalty,
-        )
+        # Tokenize the formatted prompt.
+        model_inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
 
-        #
-
-        # Remove the prompt portion so we only keep the new assistant reply.
-        assistant_text = generated[len(prompt_text):].strip()
+        # Set up a streamer for token-by-token generation.
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-        #
-
+        # Run generate in a background thread with the streamer.
+        gen_kwargs = {
+            "input_ids": model_inputs.input_ids,
+            "attention_mask": model_inputs.attention_mask,
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_k": top_k,
+            "top_p": top_p,
+            "repetition_penalty": repeat_penalty,
+            "streamer": streamer
+        }
+        thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+        thread.start()
 
-        #
-
+        # Collect tokens from the streamer as they are generated.
+        assistant_text = ""
+        for new_text in streamer:
+            assistant_text += new_text
+            conversation[-1]["content"] = assistant_text
+            yield conversation, debug_message  # Update UI token by token
+
+        thread.join()
     except Exception as e:
         conversation[-1]["content"] = f"Error: {e}"
         yield conversation, debug_message
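Taken on its own, the streaming pattern introduced in this hunk can be exercised outside the Space. The sketch below is a minimal, self-contained version under assumed settings: the tiny model sshleifer/tiny-gpt2 is picked purely to keep the example cheap, and the sampling flags are illustrative rather than the app's actual values.

# Standalone sketch of the TextIteratorStreamer + background-thread pattern.
import threading

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"  # tiny model, chosen only to keep the sketch cheap
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("User: Hello!\nAssistant:", return_tensors="pt").to(model.device)

# skip_prompt drops the echoed prompt; skip_special_tokens drops EOS/PAD markers.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs in a worker thread
# while the main thread consumes decoded text from the streamer.
thread = threading.Thread(
    target=model.generate,
    kwargs={
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "max_new_tokens": 32,
        "do_sample": True,
        "temperature": 0.7,
        "streamer": streamer,
    },
)
thread.start()

reply = ""
for chunk in streamer:  # yields decoded text pieces as they become available
    reply += chunk
    print(chunk, end="", flush=True)
thread.join()

One caveat worth checking against the hunk above: transformers only honors temperature, top_k and top_p when do_sample=True; without it, generate() falls back to greedy decoding and logs a warning that those arguments are ignored.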
@@ -288,7 +299,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     clear_button.click(fn=clear_chat, outputs=[chatbot, msg_input, search_debug])
     cancel_button.click(fn=cancel_generation, outputs=search_debug)
 
-    # Submission: the chat_response function is used with
+    # Submission: the chat_response function is used with streaming.
     msg_input.submit(
         fn=chat_response,
         inputs=[msg_input, chatbot, system_prompt_text, enable_search_checkbox,
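Because chat_response now yields intermediate (conversation, debug) pairs instead of returning once, Gradio treats it as a generator and re-renders the outputs on every yield. The sketch below shows that wiring in isolation, with hypothetical component and function names (echo_stream, box) and assuming a Gradio release that supports Chatbot(type="messages"); it is not the Space's actual layout.

# Minimal sketch of Gradio streaming from a generator callback (hypothetical names).
import time

import gradio as gr

def echo_stream(message, history):
    # With type="messages", history is a list of {"role": ..., "content": ...} dicts.
    history = history + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": ""},
    ]
    for word in "streamed reply, one piece at a time".split():
        history[-1]["content"] += word + " "
        time.sleep(0.05)
        yield history  # each yield repaints the Chatbot, as chat_response's yields do

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    box = gr.Textbox(placeholder="Type a message and press Enter")
    box.submit(fn=echo_stream, inputs=[box, chatbot], outputs=chatbot)

if __name__ == "__main__":
    demo.launch()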