Spaces:
Running
Running
add attention mask
Browse files
app.py
CHANGED
|
@@ -86,6 +86,12 @@ def load_model(model_name):
|
|
| 86 |
# Load the model and tokenizer using Transformers.
|
| 87 |
model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
|
| 88 |
tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
LOADED_MODELS[model_name] = (model, tokenizer)
|
| 90 |
CURRENT_MODEL_NAME = model_name
|
| 91 |
return model, tokenizer
|
|
@@ -158,12 +164,16 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
|
|
| 158 |
model, tokenizer = load_model(model_name)
|
| 159 |
# Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
|
| 160 |
model = model.to('cuda')
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
with torch.no_grad():
|
| 165 |
output_ids = model.generate(
|
| 166 |
input_ids,
|
|
|
|
| 167 |
max_new_tokens=max_tokens,
|
| 168 |
temperature=temperature,
|
| 169 |
top_k=top_k,
|
|
|
|
| 86 |
# Load the model and tokenizer using Transformers.
|
| 87 |
model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
|
| 88 |
tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
|
| 89 |
+
|
| 90 |
+
# If the pad token is missing or the same as the eos token, add a new pad token.
|
| 91 |
+
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
|
| 92 |
+
tokenizer.add_special_tokens({'pad_token': '<pad>'})
|
| 93 |
+
model.resize_token_embeddings(len(tokenizer))
|
| 94 |
+
|
| 95 |
LOADED_MODELS[model_name] = (model, tokenizer)
|
| 96 |
CURRENT_MODEL_NAME = model_name
|
| 97 |
return model, tokenizer
|
|
|
|
| 164 |
model, tokenizer = load_model(model_name)
|
| 165 |
# Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
|
| 166 |
model = model.to('cuda')
|
| 167 |
+
|
| 168 |
+
# Tokenize the augmented prompt with padding and retrieve the attention mask.
|
| 169 |
+
encoding = tokenizer(augmented_user_input, return_tensors="pt", padding=True)
|
| 170 |
+
input_ids = encoding["input_ids"].to('cuda')
|
| 171 |
+
attention_mask = encoding["attention_mask"].to('cuda')
|
| 172 |
|
| 173 |
with torch.no_grad():
|
| 174 |
output_ids = model.generate(
|
| 175 |
input_ids,
|
| 176 |
+
attention_mask=attention_mask,
|
| 177 |
max_new_tokens=max_tokens,
|
| 178 |
temperature=temperature,
|
| 179 |
top_k=top_k,
|