import onnxruntime as ort
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, MAX_INPUT_LENGTH

# ONNX Runtime session tuning for CPU inference: sequential execution,
# full graph optimizations, and the default CPU memory arena.
session_options = ort.SessionOptions()
session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.enable_cpu_mem_arena = True

# Export the checkpoint to ONNX on the fly and run it on the CPU execution provider.
model = ORTModelForCausalLM.from_pretrained(
    MODEL_NAME,
    export=True,
    provider="CPUExecutionProvider",
    session_options=session_options,
    trust_remote_code=True,
)
# Left padding keeps prompts right-aligned so batched generation starts cleanly.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left"
)
# Batched padding requires a pad token; fall back to EOS if the model defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def generate_response(input_texts):
    """Generate completions for a prompt string or a list of prompt strings."""
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
        add_special_tokens=True
    )
    gen_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=1,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    # temperature only applies when sampling; with TEMPERATURE == 0 stay greedy.
    if TEMPERATURE and TEMPERATURE > 0:
        gen_kwargs.update(do_sample=True, temperature=TEMPERATURE)
    outputs = model.generate(**inputs, **gen_kwargs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

__all__ = ["generate_response", "tokenizer"]
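
if __name__ == "__main__":
    # Minimal smoke test; the prompt is illustrative only.
    for reply in generate_response(["Hello! What can you do?"]):
        print(reply)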