from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from onnxruntime import ExecutionMode, GraphOptimizationLevel, SessionOptions

from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, MAX_INPUT_LENGTH
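# config.py is assumed to define the four constants imported above; the
# values shown here are hypothetical placeholders, not taken from the source:
#   MODEL_NAME = "distilgpt2"
#   MAX_NEW_TOKENS = 128
#   TEMPERATURE = 0.0
#   MAX_INPUT_LENGTH = 512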
# optimum.onnxruntime exports no ORTOptions class; the supported way to tune
# the runtime is an onnxruntime.SessionOptions passed via session_options.
# Full graph optimization plus sequential execution suits a CPU-only host.
options = SessionOptions()
options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

# export=True converts the checkpoint to ONNX at load time (the deprecated
# spelling was from_transformers=True); the explicit CPU provider replaces
# device_map, which ORT models do not accept.
model = ORTModelForCausalLM.from_pretrained(
    MODEL_NAME,
    export=True,
    session_options=options,
    provider="CPUExecutionProvider",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left",  # decoder-only models must be left-padded for batching
)

# Many causal-LM tokenizers ship without a pad token; reuse EOS so that
# padding=True in generate_response() does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
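# The dropped ORTOptions flags (enable_int8, enable_dynamic_quantization)
# point at dynamic INT8 quantization. A minimal sketch with optimum's actual
# quantization API, assuming an AVX2-capable CPU, a single exported ONNX file,
# and a hypothetical "model_int8" output directory:
#
#     from optimum.onnxruntime import ORTQuantizer
#     from optimum.onnxruntime.configuration import AutoQuantizationConfig
#
#     quantizer = ORTQuantizer.from_pretrained(model)
#     qconfig = AutoQuantizationConfig.avx2(is_static=False)  # dynamic INT8
#     quantizer.quantize(save_dir="model_int8", quantization_config=qconfig)
#     model = ORTModelForCausalLM.from_pretrained("model_int8")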
def generate_response(input_texts):
    """Batch-generate continuations for a list of prompt strings."""
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
        add_special_tokens=True,
    )
    # temperature only applies when sampling; passing it alongside
    # do_sample=False (as the original did) merely triggers a warning, and
    # early_stopping is meaningless with num_beams=1, so both are resolved here.
    do_sample = TEMPERATURE > 0
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=do_sample,
        temperature=TEMPERATURE if do_sample else 1.0,
        num_beams=1,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Decoded strings include the prompt as well as the newly generated tokens.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


__all__ = ["generate_response", "tokenizer"]
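
# Hypothetical smoke test (the prompts are illustrative, not from the source);
# run this module directly after pointing config.py at a real checkpoint.
if __name__ == "__main__":
    for text in generate_response(["Hello, my name is", "Once upon a time"]):
        print(text)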