pin torch to 2.4.0
- app.py +19 -19
- requirements.txt +2 -2
app.py
CHANGED
@@ -7,19 +7,18 @@ from datetime import datetime
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import hf_hub_download
 from duckduckgo_search import DDGS
-import spaces
+import spaces  # Import spaces early to enable ZeroGPU support
+
+# Disable GPU visibility if you wish to force CPU usage outside of GPU functions
+# (Not strictly needed for ZeroGPU as the decorator handles allocation)
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
 # ------------------------------
 # Global Cancellation Event
 # ------------------------------
 cancel_event = threading.Event()
 
-# ------------------------------
-# Model Definitions and Global Variables (PyTorch/Transformers)
-# ------------------------------
-# Here, the repo_id should point to a model checkpoint that is compatible with Hugging Face Transformers.
 # ------------------------------
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
@@ -70,7 +69,6 @@ MODELS = {
     },
 }
 
-
 LOADED_MODELS = {}
 CURRENT_MODEL_NAME = None
 
@@ -82,7 +80,7 @@ def load_model(model_name):
     if model_name in LOADED_MODELS:
         return LOADED_MODELS[model_name]
     selected_model = MODELS[model_name]
-    # Load
+    # Load the model and tokenizer using Transformers.
     model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     LOADED_MODELS[model_name] = (model, tokenizer)
@@ -106,15 +104,15 @@ def retrieve_context(query, max_results=6, max_chars_per_result=600):
         return ""
 
 # ------------------------------
-# Chat Response Generation
+# Chat Response Generation with ZeroGPU
 # ------------------------------
-@spaces.GPU
+@spaces.GPU(duration=60)  # This decorator triggers GPU allocation for up to 60 seconds.
 def chat_response(user_message, chat_history, system_prompt, enable_search,
                   max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty):
     # Reset the cancellation event.
     cancel_event.clear()
 
-    # Prepare internal history.
+    # Prepare internal chat history.
     internal_history = list(chat_history) if chat_history else []
     internal_history.append({"role": "user", "content": user_message})
 
@@ -138,7 +136,7 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
         retrieved_context = ""
         debug_message = "Web search disabled."
 
-    # Augment prompt with search context if available.
+    # Augment the prompt with search context if available.
     if enable_search and retrieved_context:
         augmented_user_input = (
             f"{system_prompt.strip()}\n\n"
@@ -153,11 +151,13 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
     internal_history.append({"role": "assistant", "content": ""})
 
     try:
-        # Load the
+        # Load the model and tokenizer.
         model, tokenizer = load_model(model_name)
+        # Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
+        model = model.to('cuda')
+        # Tokenize the augmented prompt and move input tensors to GPU.
+        input_ids = tokenizer(augmented_user_input, return_tensors="pt").input_ids.to('cuda')
 
-        # Tokenize the input prompt.
-        input_ids = tokenizer(augmented_user_input, return_tensors="pt").input_ids
         with torch.no_grad():
             output_ids = model.generate(
                 input_ids,
@@ -168,13 +168,12 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
                 repetition_penalty=repeat_penalty,
                 do_sample=True
             )
-
         # Decode the generated tokens.
         generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-        #
+        # Remove the original prompt to isolate the assistant's reply.
        assistant_text = generated_text[len(augmented_user_input):].strip()
 
-        # Simulate streaming by yielding
+        # Simulate streaming output by yielding word-by-word.
         words = assistant_text.split()
         assistant_message = ""
         for word in words:
@@ -205,7 +204,7 @@ def cancel_generation():
 with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     gr.Markdown("## 🧠 ZeroGPU LLM Inference with Web Search")
     gr.Markdown("Interact with the model. Select your model, set your system prompt, and adjust parameters on the left.")
-
+
     with gr.Row():
         with gr.Column(scale=3):
             default_model = list(MODELS.keys())[0] if MODELS else "No models available"
@@ -252,6 +251,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     clear_button.click(fn=clear_chat, outputs=[chatbot, msg_input, search_debug])
    cancel_button.click(fn=cancel_generation, outputs=search_debug)
 
+    # Submission: the chat_response function is now decorated with @spaces.GPU.
    msg_input.submit(
         fn=chat_response,
         inputs=[msg_input, chatbot, system_prompt_text, enable_search_checkbox,
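The app.py side of this commit follows the usual ZeroGPU recipe: import `spaces` early, keep model loading on the CPU, and move weights and inputs to CUDA only inside the `@spaces.GPU`-decorated function, which then yields partial text back to Gradio to simulate streaming. A minimal standalone sketch of that recipe, assuming the same `spaces` and `transformers` APIs used in the diff (the model id `distilgpt2` and the 64-token cap are placeholders, not this Space's configuration):

import torch
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "distilgpt2"  # placeholder checkpoint; the Space selects repos from its MODELS table

# Loaded once on CPU at import time; a ZeroGPU Space has no GPU outside decorated calls.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

@spaces.GPU(duration=60)  # a GPU is attached only while this function runs
def generate_stream(prompt: str, max_new_tokens: int = 64):
    gpu_model = model.to("cuda")  # move weights onto the allocated GPU
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    with torch.no_grad():
        output_ids = gpu_model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=True)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    reply = text[len(prompt):].strip()
    # Simulated streaming: generation already finished; yield growing prefixes word by word.
    partial = ""
    for word in reply.split():
        partial = f"{partial} {word}".strip()
        yield partial

Like chat_response in the diff, the sketch is a generator, so it can be wired directly to a Gradio event handler that streams each yielded prefix into the chatbot.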
requirements.txt
CHANGED
@@ -1,8 +1,8 @@
 wheel
 streamlit
 duckduckgo_search
-gradio
-torch
+gradio>=4.0.0
+torch==2.4.0
 transformers
 spaces
 sentencepiece
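requirements.txt now pins torch exactly and sets a gradio floor. A hypothetical startup check, not part of this commit, that fails fast if the Space was built against different wheels than the pins request:

import gradio
import torch

# torch wheels may carry a local suffix such as "2.4.0+cu121"; compare only the release part.
assert torch.__version__.split("+")[0] == "2.4.0", f"unexpected torch build: {torch.__version__}"
assert int(gradio.__version__.split(".")[0]) >= 4, f"gradio too old: {gradio.__version__}"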