import onnxruntime

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, MAX_INPUT_LENGTH
# Session-level CPU behaviour is configured through onnxruntime.SessionOptions
# (optimum does not provide an ORTOptions class with the flags used before).
session_options = onnxruntime.SessionOptions()
session_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL  # run graph nodes sequentially
session_options.enable_cpu_mem_arena = True  # reuse the CPU memory arena between runs
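
# Int8 dynamic quantization is not a session option; with optimum it is a separate
# offline step. A rough sketch, with an illustrative save_dir (decoder models that
# export multiple ONNX files may need per-file handling):
#
#   from optimum.onnxruntime import ORTQuantizer
#   from optimum.onnxruntime.configuration import AutoQuantizationConfig
#
#   quantizer = ORTQuantizer.from_pretrained(model)
#   qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)
#   quantizer.quantize(save_dir="model_int8", quantization_config=qconfig)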

model = ORTModelForCausalLM.from_pretrained(
    MODEL_NAME,
    export=True,  # export the checkpoint to ONNX at load time ("from_transformers=True" on older optimum releases)
    session_options=session_options,
    provider="CPUExecutionProvider",  # run inference on CPU
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left"  # left-pad so batched prompts align correctly for decoder-only generation
)
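
# Many causal-LM tokenizers ship without a pad token; fall back to EOS so that
# padding=True works for batched prompts (assumes EOS padding is acceptable here).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token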


def generate_response(input_texts):
    """Tokenize a batch of prompts, generate continuations, and return the decoded text."""
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
        add_special_tokens=True
    )
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=True,  # temperature only takes effect when sampling is enabled
        num_beams=1,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

__all__ = ["generate_response", "tokenizer"]
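
# Minimal smoke test when the module is run directly (the prompts are illustrative;
# MODEL_NAME and the generation settings come from config.py).
if __name__ == "__main__":
    for reply in generate_response(["Hello, how are you?", "Explain ONNX Runtime in one sentence."]):
        print(reply)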