# MiniCPM-4.1-8B-Eagle3
from pathlib import Path
import time
import logging
import gradio as gr
import torch
import spaces
import threading
from transformers import AutoTokenizer, TextIteratorStreamer
# Import model-related modules
from eagle.model.ea_model import EaModel
from utils_chatbot import organize_messages, stream2display_text, mtp_new_tokens
# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Global model instance
global_model = None
# Global model cache (held in the GPU process)
_gpu_model_cache = None
# Global model configuration
model_config = dict(
    base_model_path="openbmb/MiniCPM4.1-8B",
    ea_model_path="openbmb/MiniCPM4.1-8B-Eagle3/MiniCPM4_1-8B-Eagle3-bf16",
    total_token=40,
    depth=3,
    top_k=10,
    threshold=1.0,
    use_eagle3=True,
    device_map="cpu",
    trust_remote_code=True,
)
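# Note (assumption): total_token, depth, top_k, and threshold presumably control the
# EAGLE-3 draft tree (number of drafted tokens, tree depth, candidates expanded per
# node, and the acceptance threshold); see the EaModel implementation for their exact semantics.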
# Load the tokenizer ahead of time
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM4.1-8B",
    use_fast=False,
    device_map="cpu",
)
def _initialize_gpu_model():
    """Get the cached model inside the GPU process, loading it on CPU if necessary."""
    global _gpu_model_cache
    if _gpu_model_cache is None:
        logger.info("Initializing the model in the GPU process")
        _gpu_model_cache = EaModel.from_pretrained(**model_config)
        logger.info("Model initialized on CPU")
    return _gpu_model_cache
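# The @spaces.GPU handlers below request a ZeroGPU slot only for the duration of each
# decorated call; the weights stay cached at module level (on CPU) and are moved to
# CUDA inside the handler, so repeated requests do not reload the model from disk.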
@spaces.GPU(duration=42)  # default is 60
def gpu_handler(inputs):
    """Non-streaming GPU inference handler."""
    prompt_text = tokenizer.apply_chat_template(
        inputs,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt")
    inputs = {
        "model_inputs": model_inputs,
        "max_new_tokens": 65536,
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 50,
        "max_length": 65536,
    }
    logger.info("Moving global_model to the GPU")
    model = _initialize_gpu_model()
    cuda_inputs = dict(
        input_ids=inputs["model_inputs"].input_ids.to("cuda"),
        # attention_mask=inputs["model_inputs"].attention_mask.to("cuda"),
        max_new_tokens=inputs["max_new_tokens"],
        temperature=inputs["temperature"],
        top_p=inputs["top_p"],
        top_k=inputs["top_k"],
        max_length=inputs["max_length"],
    )
    model.base_model.to("cuda")
    model.ea_layer.to("cuda")
    # Tensor.to() is not in-place, so reassign the tree mask explicitly.
    model.ea_layer.tree_mask_init = model.ea_layer.tree_mask_init.to("cuda")
    logger.info("Passing inputs to global_model")
    output_ids = model.eagenerate(**cuda_inputs)
    logger.info("Got outputs from global_model.eagenerate")
    # Decode only the newly generated tokens (everything after the prompt).
    new_text = tokenizer.decode(
        output_ids[0][model_inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    )
    return new_text
@spaces.GPU(duration=60)  # default is 60
def gpu_handler_s(
    inputs,
    history,
    temperature,
    top_p,
    use_eagle,
):
    """Streaming GPU inference handler."""
    prompt_text = tokenizer.apply_chat_template(
        inputs,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt")
    inputs = {
        "model_inputs": model_inputs,
        "max_new_tokens": 4096,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": 50,
        "max_length": 65536,
    }
    logger.info("Moving global_model to the GPU")
    model = _initialize_gpu_model()
    cuda_inputs = dict(
        input_ids=inputs["model_inputs"].input_ids.to("cuda"),
        # attention_mask=inputs["model_inputs"].attention_mask.to("cuda"),
        max_new_tokens=inputs["max_new_tokens"],
        temperature=inputs["temperature"],
        top_p=inputs["top_p"],
        top_k=inputs["top_k"],
        max_length=inputs["max_length"],
    )
    model.base_model.to("cuda")
    model.ea_layer.to("cuda")
    # Tensor.to() is not in-place, so reassign the tree mask explicitly.
    model.ea_layer.tree_mask_init = model.ea_layer.tree_mask_init.to("cuda")
    logger.info("Passing inputs to global_model")
    yield "", history
    stop_token_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    gen_tk_count, existing_tk_count = 0, len(inputs["model_inputs"].input_ids[0])
    stream_text, start_time = "", time.time()
    # Use the EAGLE-3 speculative path when enabled, otherwise plain autoregressive decoding.
    generate_func = model.ea_generate if use_eagle else model.naive_generate
    for output_ids in generate_func(**cuda_inputs):
        new_tokens, gen_tk_count = mtp_new_tokens(output_ids, gen_tk_count, existing_tk_count, stop_token_ids)
        new_token_text = tokenizer.decode(new_tokens, skip_special_tokens=False)
        logger.info(f"[MTP]'''{new_token_text}'''")
        stream_text += new_token_text
        token_per_sec = gen_tk_count / (time.time() - start_time)
        display_text = stream2display_text(stream_text, token_per_sec)
        history[-1] = (history[-1][0], display_text)
        yield "", history
    # Replace the displayed text with the plain (non-display) text in the history,
    # then yield once more so the UI keeps the version without the tok/s overlay.
    history[-1] = (history[-1][0], stream_text.replace("<|im_end|>", ""))
    yield "", history
class Model:
    """Model wrapper class; it does not hold the actual model object."""
    def __init__(self):
        logger.info("Creating the model wrapper")
    def handler(self, inputs):
        """Non-streaming inference handler."""
        return gpu_handler(inputs)
    def stream_handler(self, inputs, history, **kwargs):
        """Streaming inference handler."""
        yield from gpu_handler_s(inputs, history, **kwargs)
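# Assumption: keeping the wrapper stateless presumably avoids carrying the heavy model
# object through Gradio callbacks and across the ZeroGPU worker boundary; the handlers
# fetch the weights from the module-level cache instead.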
def initialize_model():
    """Initialize the global model."""
    global global_model, _gpu_model_cache
    # Default configuration
    logger.info("=" * 50)
    logger.info("Starting the MiniCPM-4.1-8B-Eagle3 chatbot service")
    logger.info("=" * 50)
    # Create the model wrapper
    global_model = Model()
    # Preload the model onto the CPU in the main process (for a faster first inference)
    try:
        logger.info("Preloading the model onto the CPU in the main process...")
        _gpu_model_cache = EaModel.from_pretrained(**model_config)
        logger.info("Model preloaded onto the CPU in the main process")
    except Exception as e:
        logger.warning(f"Failed to preload the model in the main process; it will be loaded in the GPU process: {e}")
        _gpu_model_cache = None
    return global_model
def gen_response(message, history, temperature, top_p):
    chat_msg_ls = organize_messages(message, history)
    new_text = global_model.handler(chat_msg_ls)
    history.append((message, new_text))
    return "", history
def gen_response_stream(
    message,
    history,
    temperature,
    top_p,
    use_eagle,
):
    chat_msg_ls = organize_messages(message, history)
    history.append((message, ""))
    sampling_kwargs = dict(
        temperature=temperature,
        top_p=top_p,
        use_eagle=use_eagle,
    )
    yield from global_model.stream_handler(chat_msg_ls, history, **sampling_kwargs)
def create_app():
    assets_path = Path.cwd().absolute() / "assets"
    gr.set_static_paths(paths=[assets_path])
    logger.info(f"Static resource path: {assets_path}. READY.")
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
    )
    with gr.Blocks(
        theme=theme,
        css="""
        .logo-container {
            text-align: center;
            margin: 0.5rem 0 1rem 0;
        }
        .logo-container img {
            height: 96px;
            width: auto;
            max-width: 200px;
            display: inline-block;
        }
        .input-box {
            border: 1px solid #2f63b8;
            border-radius: 8px;
        }
        """,
    ) as demo:
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/OpenBMB-MiniCPM.png" alt="MiniCPM Logo"></div>')
                blank_1 = gr.HTML("<div style='height:1px;'></div>")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.05, label="Temperature", scale=1)
                top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.01, label="Top-p", scale=1)
                use_eagle = gr.Checkbox(label="Speculative Decoding", value=True)
                blank_2 = gr.HTML("<div style='height:128px;'></div>")
                clear = gr.Button("Clear History")
                gr.Markdown(
                    """
                    Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a>
                    """
                )
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(label="Chat History", placeholder="Input to start a new chat", height=500)
                prompt = gr.Textbox(
                    label="Input Text",
                    placeholder="Type your message here...",
                    lines=1,
                    # submit_btn=True,
                    elem_classes=["input-box"],  # custom class targeted by the CSS above
                )
        prompt.submit(gen_response_stream, inputs=[prompt, chatbot, temperature, top_p, use_eagle], outputs=[prompt, chatbot])
        clear.click(lambda: None, None, chatbot, queue=False)
    return demo
if __name__ == "__main__":
    # Initialize the model
    initialize_model()
    # Create and launch the application
    demo = create_app()
    demo.launch()
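# Running locally (assumption): outside of a ZeroGPU Space the @spaces.GPU decorator is
# effectively a no-op, so `python app.py` should work on any machine with a CUDA GPU
# large enough to hold MiniCPM4.1-8B plus the Eagle3 draft head.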