Hugging Face Spaces (status: Running) — commit: "Add AOT compilation optimization for ZeroGPU acceleration"
File changed: app.py
@@ -11,6 +11,7 @@ from transformers import pipeline, TextIteratorStreamer
|
|
| 11 |
from transformers import AutoTokenizer
|
| 12 |
from ddgs import DDGS
|
| 13 |
import spaces # Import spaces early to enable ZeroGPU support
|
|
|
|
| 14 |
|
| 15 |
access_token=os.environ['HF_TOKEN']
|
| 16 |
|
|
@@ -329,7 +330,7 @@ def format_conversation(history, system_prompt, tokenizer):
|
|
| 329 |
return prompt
|
| 330 |
|
| 331 |
def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
|
| 332 |
-
base_duration =
|
| 333 |
token_duration = max_tokens * 0.1 # Estimate 0.1 seconds per token
|
| 334 |
search_duration = 30 if enable_search else 0
|
| 335 |
return base_duration + token_duration + search_duration
|
|
@@ -417,6 +418,36 @@ def chat_response(user_msg, chat_history, system_prompt,
|
|
| 417 |
enriched = system_prompt
|
| 418 |
|
| 419 |
pipe = load_pipeline(model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
prompt = format_conversation(history, enriched, pipe.tokenizer)
|
| 421 |
prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
|
| 422 |
streamer = TextIteratorStreamer(pipe.tokenizer,
|
|
|
|
| 11 |
from transformers import AutoTokenizer
|
| 12 |
from ddgs import DDGS
|
| 13 |
import spaces # Import spaces early to enable ZeroGPU support
|
| 14 |
+
from torch.utils._pytree import tree_map
|
| 15 |
|
| 16 |
access_token=os.environ['HF_TOKEN']
|
| 17 |
|
|
|
|
| 330 |
return prompt
|
| 331 |
|
| 332 |
def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
|
| 333 |
+
base_duration = 120 # Increased for AOT compilation
|
| 334 |
token_duration = max_tokens * 0.1 # Estimate 0.1 seconds per token
|
| 335 |
search_duration = 30 if enable_search else 0
|
| 336 |
return base_duration + token_duration + search_duration
|
|
|
|
| 418 |
enriched = system_prompt
|
| 419 |
|
| 420 |
pipe = load_pipeline(model_name)
|
| 421 |
+
|
| 422 |
+
# AOT compilation for performance optimization
|
| 423 |
+
try:
|
| 424 |
+
with spaces.aoti_capture(pipe.model) as call:
|
| 425 |
+
pipe("Hello world", max_new_tokens=5, do_sample=False, pad_token_id=pipe.tokenizer.eos_token_id)
|
| 426 |
+
|
| 427 |
+
# Define dynamic shapes for variable sequence lengths
|
| 428 |
+
seq_dim = torch.export.Dim('seq', min=1, max=4096)
|
| 429 |
+
dynamic_shapes = tree_map(lambda v: None, call.kwargs)
|
| 430 |
+
|
| 431 |
+
# Set dynamic dimensions for common inputs
|
| 432 |
+
if 'input_ids' in call.kwargs:
|
| 433 |
+
dynamic_shapes['input_ids'] = {1: seq_dim}
|
| 434 |
+
if 'attention_mask' in call.kwargs:
|
| 435 |
+
dynamic_shapes['attention_mask'] = {1: seq_dim}
|
| 436 |
+
if 'position_ids' in call.kwargs:
|
| 437 |
+
dynamic_shapes['position_ids'] = {1: seq_dim}
|
| 438 |
+
|
| 439 |
+
exported = torch.export.export(
|
| 440 |
+
pipe.model,
|
| 441 |
+
args=call.args,
|
| 442 |
+
kwargs=call.kwargs,
|
| 443 |
+
dynamic_shapes=dynamic_shapes
|
| 444 |
+
)
|
| 445 |
+
compiled = spaces.aoti_compile(exported)
|
| 446 |
+
spaces.aoti_apply(compiled, pipe.model)
|
| 447 |
+
print(f"AOT compilation successful for {model_name}")
|
| 448 |
+
except Exception as e:
|
| 449 |
+
print(f"AOT compilation failed for {model_name}: {e}")
|
| 450 |
+
|
| 451 |
prompt = format_conversation(history, enriched, pipe.tokenizer)
|
| 452 |
prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
|
| 453 |
streamer = TextIteratorStreamer(pipe.tokenizer,
|