ZeroGPU-LLM-Inference

Running

Luigi committed on Apr 12

Commit

b6b3940

1 Parent(s): 4731160

add attention mask

Files changed (1) hide show

app.py CHANGED Viewed

@@ -86,6 +86,12 @@ def load_model(model_name):
     # Load the model and tokenizer using Transformers.
     model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     LOADED_MODELS[model_name] = (model, tokenizer)
     CURRENT_MODEL_NAME = model_name
     return model, tokenizer
@@ -158,12 +164,16 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
         model, tokenizer = load_model(model_name)
         # Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
         model = model.to('cuda')
-        # Tokenize the augmented prompt and move input tensors to GPU.
-        input_ids = tokenizer(augmented_user_input, return_tensors="pt").input_ids.to('cuda')
         with torch.no_grad():
             output_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_tokens,
                 temperature=temperature,
                 top_k=top_k,

     # Load the model and tokenizer using Transformers.
     model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
+    # If the pad token is missing or the same as the eos token, add a new pad token.
+    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
+        tokenizer.add_special_tokens({'pad_token': '<pad>'})
+        model.resize_token_embeddings(len(tokenizer))
     LOADED_MODELS[model_name] = (model, tokenizer)
     CURRENT_MODEL_NAME = model_name
     return model, tokenizer
         model, tokenizer = load_model(model_name)
         # Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
         model = model.to('cuda')
+        # Tokenize the augmented prompt with padding and retrieve the attention mask.
+        encoding = tokenizer(augmented_user_input, return_tensors="pt", padding=True)
+        input_ids = encoding["input_ids"].to('cuda')
+        attention_mask = encoding["attention_mask"].to('cuda')
         with torch.no_grad():
             output_ids = model.generate(
                 input_ids,
+                attention_mask=attention_mask,
                 max_new_tokens=max_tokens,
                 temperature=temperature,
                 top_k=top_k,