Update app.py
app.py CHANGED
@@ -90,9 +90,9 @@ def inference(query):
 
     tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
     outputs = model.generate(tokenized_chat, **generation_params)
-    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
+    # decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
     # assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-    response =
+    response = outputs[0][tokenized_chat.shape[-1]:]
     response = tokenizer.decode(response, skip_special_tokens=True)
     return response
     # outputs = model.generate(tokenized_chat, **generation_params, streamer=streamer)
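The change completes the previously broken `response =` line by slicing off the prompt tokens before decoding, so only the assistant's newly generated tokens are returned instead of the full prompt-plus-completion string. Below is a minimal, self-contained sketch of the same pattern; the model checkpoint, example message, and `max_new_tokens` value are illustrative placeholders, not taken from this Space.

```python
# Minimal sketch of the prompt-stripping pattern used in the diff above.
# The checkpoint, message, and max_new_tokens are assumptions for illustration only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumption: any chat-tuned causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

messages = [{"role": "user", "content": "What is the capital of France?"}]

# Build the chat prompt and move it to the model's device.
tokenized_chat = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(tokenized_chat, max_new_tokens=128)

# generate() returns the prompt tokens followed by the completion, so skip the
# first tokenized_chat.shape[-1] tokens to keep only the newly generated ones.
response_ids = outputs[0][tokenized_chat.shape[-1]:]
response = tokenizer.decode(response_ids, skip_special_tokens=True)
print(response)
```

Decoding only the sliced token IDs avoids splitting the fully decoded string on the "<|start_header_id|>assistant<|end_header_id|>" marker, which is what the commented-out `batch_decode` / `assistant_response` lines were doing.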