Remove AOT compilation completely and enable use_cache
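This commit drops the per-request AOT compilation block from chat_response and instead enables past-key-value caching (use_cache=True) in both model-loading paths of load_pipeline.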
app.py CHANGED
@@ -338,7 +338,7 @@ def load_pipeline(model_name):
         trust_remote_code=True,
         torch_dtype=dtype,
         device_map="auto",
-        use_cache=
+        use_cache=True,  # Enable past-key-value caching
         token=access_token)
     PIPELINES[model_name] = pipe
     return pipe
@@ -350,7 +350,8 @@ def load_pipeline(model_name):
         model=repo,
         tokenizer=tokenizer,
         trust_remote_code=True,
-        device_map="auto"
+        device_map="auto",
+        use_cache=True
     )
     PIPELINES[model_name] = pipe
     return pipe
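For context: with use_cache=True, transformers keeps the attention keys/values of already-processed tokens, so each decoding step only runs the newly generated token through the model instead of re-encoding the whole prefix. A minimal sketch of the mechanism being enabled (gpt2 is a stand-in checkpoint for illustration, not one of the models this Space serves):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", use_cache=True)

inputs = tok("Hello world", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)                  # prefill: runs the full prompt
next_token = out.logits[:, -1:].argmax(-1)
with torch.no_grad():
    out = model(next_token,                # decode step: only the new token,
                past_key_values=out.past_key_values)  # reusing cached K/V

In the from_pretrained path the flag acts as a config override; in the pipeline(...) path it should end up among the pipeline's default generation kwargs.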
@@ -483,41 +484,6 @@ def chat_response(user_msg, chat_history, system_prompt,
 
     pipe = load_pipeline(model_name)
 
-    # Determine actual model size for AOT decision
-    actual_params = sum(p.numel() for p in pipe.model.parameters())
-    model_size_b = actual_params / 1e9  # Convert to billions
-    use_aot = model_size_b >= 2  # Only compile models >= 2B parameters
-
-    if use_aot:
-        try:
-            with spaces.aoti_capture(pipe.model) as call:
-                pipe("Hello world", max_new_tokens=5, do_sample=False, pad_token_id=pipe.tokenizer.eos_token_id)
-
-            # Define dynamic shapes for variable sequence lengths
-            seq_dim = torch.export.Dim('seq', min=1, max=4096)
-            dynamic_shapes = {
-                'input_ids': {1: seq_dim} if 'input_ids' in call.kwargs else None,
-                'attention_mask': {1: seq_dim} if 'attention_mask' in call.kwargs else None,
-                'inputs_embeds': None,
-                'use_cache': None,
-                'cache_position': {1: seq_dim} if 'cache_position' in call.kwargs or 'position_ids' in call.kwargs else None,
-                'kwargs': {k: None for k in call.kwargs if k not in ['input_ids', 'attention_mask', 'inputs_embeds', 'use_cache', 'cache_position', 'position_ids']}
-            }
-
-            exported = torch.export.export(
-                pipe.model,
-                args=call.args,
-                kwargs=call.kwargs,
-                dynamic_shapes=dynamic_shapes
-            )
-            compiled = spaces.aoti_compile(exported)
-            spaces.aoti_apply(compiled, pipe.model)
-            print(f"AOT compilation successful for {model_name} ({model_size_b:.1f}B parameters)")
-        except Exception as e:
-            print(f"AOT compilation failed for {model_name}: {e}")
-    else:
-        print(f"Skipping AOT compilation for small model {model_name} ({model_size_b:.1f}B parameters)")
-
     prompt = format_conversation(history, enriched, pipe.tokenizer)
     prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
     streamer = TextIteratorStreamer(pipe.tokenizer,
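The deleted block wrapped torch.export with the ZeroGPU AOT helpers (spaces.aoti_capture, spaces.aoti_compile, spaces.aoti_apply). The fiddly part was the dynamic_shapes spec: torch.export otherwise specializes the graph to the example input's exact shapes, so the sequence dimension had to be declared dynamic. A toy sketch of just that mechanism, independent of this app's model:

import torch

class Toy(torch.nn.Module):
    def forward(self, input_ids):          # shape (batch, seq)
        return input_ids * 2

seq = torch.export.Dim("seq", min=1, max=4096)
ep = torch.export.export(
    Toy(),
    args=(torch.zeros(2, 8, dtype=torch.long),),   # example input: seq=8
    dynamic_shapes={"input_ids": {1: seq}},        # dim 1 may vary at run time
)
out = ep.module()(torch.zeros(2, 16, dtype=torch.long))  # seq=16 also accepted
print(out.shape)  # torch.Size([2, 16])

Presumably the two halves of the commit are linked: with use_cache=True the forward signature changes between prefill and decode (past_key_values grows step by step), which a graph exported from a single captured call does not accommodate, so removing AOT entirely is the simpler trade.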