Spaces:
Running
Running
File size: 1,328 Bytes
34cc479 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import gradio as gr
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM, ORTOptions
# Model choice: the ONNX build of Microsoft's Phi-3-mini. The original author
# describes it as an ultra-light model (about 3.8B parameters) picked for
# fast inference.
model_name = "microsoft/Phi-3-mini-4k-instruct-ONNX"

# Runtime options meant to reduce CPU cost through INT8 / dynamic quantization.
# NOTE(review): confirm that the installed optimum release actually provides
# `ORTOptions` and that `from_pretrained` accepts `from_transformers` and
# `ort_options`; this block reproduces the original calls unchanged.
options = ORTOptions(
    enable_int8=True,
    enable_dynamic_quantization=True,
)
model = ORTModelForCausalLM.from_pretrained(
    model_name,
    from_transformers=True,
    ort_options=options,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def generate_text(input_texts):
    """Generate a short greedy continuation for each prompt.

    Args:
        input_texts: Either a single string (what the multi-line Gradio
            textbox delivers), in which case every non-blank line is treated
            as its own prompt, or a sequence of prompt strings. ``None`` is
            treated like an empty string.

    Returns:
        For string input: the decoded sequences joined with newlines, as one
        string suitable for a Gradio "text" output. For sequence input: a
        list with one decoded string per prompt. Empty input returns ``""``
        (or ``[]``) without calling the tokenizer or the model.
    """
    if input_texts is None:
        input_texts = ""
    single_string = isinstance(input_texts, str)
    raw_lines = input_texts.splitlines() if single_string else list(input_texts)
    prompts = [line.strip() for line in raw_lines if line and line.strip()]
    if not prompts:
        # Nothing to generate from; skip the expensive model call entirely.
        return "" if single_string else []

    # Batched generation with a decoder-only model needs left padding so the
    # new tokens directly follow each prompt's real tokens, not pad tokens.
    tokenizer.padding_side = "left"
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=32,
    )
    # Plain greedy decoding. `temperature` is ignored when do_sample=False and
    # `early_stopping` only applies to beam search, so neither is passed.
    outputs = model.generate(
        **inputs,
        max_new_tokens=8,  # keep generation very short for speed
        do_sample=False,
        num_beams=1,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return "\n".join(decoded) if single_string else decoded
# Multi-line textbox so that several lines of text can be submitted in a
# single request; the text is passed to generate_text and its result is shown
# in a plain text output.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(multiline=True),
    outputs="text",
)
iface.launch()