Spaces:
Running
Running
| import gradio as gr | |
| from transformers import AutoTokenizer | |
| from optimum.onnxruntime import ORTModelForCausalLM, ORTOptions | |
# Use a very small model: Microsoft Phi-3-mini (about 3.8B parameters),
# taken from its ONNX repository.
model_name = "microsoft/Phi-3-mini-4k-instruct-ONNX"

# Ask for INT8 / dynamic quantization through the options object.
# NOTE(review): `ORTOptions`, `ort_options=` and `from_transformers=` could
# not be confirmed against the optimum.onnxruntime API -- verify that the
# installed optimum version really provides them before relying on this.
options = ORTOptions(
    enable_int8=True,
    enable_dynamic_quantization=True,
)

# Load the model first, then the matching tokenizer, both by repository name.
model = ORTModelForCausalLM.from_pretrained(
    model_name,
    from_transformers=True,
    ort_options=options,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def generate_text(input_texts):
    """Generate a short greedy continuation for every non-blank prompt.

    Args:
        input_texts: Either one string holding one prompt per line (what the
            multi-line textbox delivers) or an iterable of prompt strings.
            ``None`` is treated like empty input.

    Returns:
        One string containing the decoded output for each prompt, in input
        order, separated by newlines. Returns ``""`` when there is no
        non-blank prompt, without calling the tokenizer or the model.
    """
    if input_texts is None:
        return ""

    # A textbox hands over a single string; split it so that each line
    # becomes its own prompt and the lines are processed as one batch.
    if isinstance(input_texts, str):
        candidates = input_texts.splitlines()
    else:
        candidates = list(input_texts)

    prompts = [line for line in candidates if line and line.strip()]
    if not prompts:
        # Nothing to generate from; avoid tokenizing an empty batch.
        return ""

    # Prompts are truncated to 32 tokens and padded to a common length.
    # NOTE(review): batched generation with a decoder-only model normally
    # needs a pad token and left padding -- confirm the tokenizer's settings.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=32,
    )

    # Greedy decoding, at most 8 new tokens per prompt. `temperature` and
    # `early_stopping` are deliberately not passed: the former is only used
    # when sampling and the latter only by beam search, so with
    # do_sample=False / num_beams=1 they had no effect beyond warnings.
    outputs = model.generate(
        **inputs,
        max_new_tokens=8,
        do_sample=False,
        num_beams=1,
    )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # The UI output is a text component, so return one string, not a list.
    return "\n".join(decoded)
# Web UI: a multi-line textbox feeds generate_text and the result is shown
# in a plain text output.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(multiline=True),
    outputs="text",
)
iface.launch()