Spaces:

openbmb
/

MiniCPM4.1-8B-Demo

Build error

File size: 9,303 Bytes

# MiniCPM-4.1-8B-Eagle3

from pathlib import Path
import time
import logging
import gradio as gr
import torch
import spaces
import threading
from transformers import AutoTokenizer, TextIteratorStreamer
# Import model-related modules
from eagle.model.ea_model import EaModel
from utils_chatbot import organize_messages, stream2display_text, mtp_new_tokens

# Logging configuration: INFO-level root logger plus this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Global model wrapper instance; assigned by initialize_model().
global_model = None
# Global cache for the loaded EaModel (used inside the GPU process); filled
# either by initialize_model() or lazily by _initialize_gpu_model().
_gpu_model_cache = None
# Global model configuration, unpacked into EaModel.from_pretrained().
# device_map is "cpu": the handlers move the sub-modules to CUDA themselves.
model_config = dict(
    base_model_path = "openbmb/MiniCPM4.1-8B",
    ea_model_path = "openbmb/MiniCPM4.1-8B-Eagle3/MiniCPM4_1-8B-Eagle3-bf16",
    total_token=40,
    depth=3,
    top_k=10,
    threshold=1.0,
    use_eagle3=True,
    device_map = "cpu",
    trust_remote_code=True,
)

# Load the tokenizer ahead of time (at import) so every handler shares it.
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM4.1-8B",
    use_fast=False,
    device_map="cpu",
)

def _initialize_gpu_model():
    """Return the cached EaModel, building it from ``model_config`` on first use.

    The instance is kept in the module-level ``_gpu_model_cache`` so that
    later calls in the same process reuse it instead of reloading.
    """
    global _gpu_model_cache

    # Fast path: a model was already loaded (here or by initialize_model()).
    if _gpu_model_cache is not None:
        return _gpu_model_cache

    logger.info("在GPU进程中初始化模型")
    _gpu_model_cache = EaModel.from_pretrained(**model_config)
    logger.info("模型在CPU上初始化完成")
    return _gpu_model_cache

@spaces.GPU(duration=42) # default is 60
def gpu_handler(inputs):
    """Run one non-streaming generation on the GPU and return the new text.

    Args:
        inputs: Chat messages in the form accepted by
            ``tokenizer.apply_chat_template``.

    Returns:
        The decoded completion: the generated ids with the prompt tokens
        sliced off, decoded with special tokens skipped.
    """
    prompt_text = tokenizer.apply_chat_template(
        inputs,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt")
    prompt_len = model_inputs.input_ids.shape[1]

    logger.info("向 GPU 搬运 global_model")
    model = _initialize_gpu_model()

    model.base_model.to("cuda")
    model.ea_layer.to("cuda")
    # Tensor.to() is not in-place: it returns the moved tensor. The result
    # must be assigned back, otherwise the mask silently stays where it was.
    model.ea_layer.tree_mask_init = model.ea_layer.tree_mask_init.to("cuda")

    logger.info("pass inputs to global_model")

    # attention_mask is not forwarded (it was already disabled in this call);
    # the batch holds a single prompt, so no padding is involved.
    output_ids = model.eagenerate(
        input_ids=model_inputs.input_ids.to("cuda"),
        max_new_tokens=65536,
        temperature=0.6,
        top_p=0.95,
        top_k=50,
        max_length=65536,
    )

    logger.info("got outputs from global_model.eagenerate")

    # Drop the prompt prefix so only newly generated tokens are decoded.
    return tokenizer.decode(
        output_ids[0][prompt_len:],
        skip_special_tokens=True,
    )

@spaces.GPU(duration=60) # default is 60
def gpu_handler_s(
    inputs,
    history,
    temperature,
    top_p,
    use_eagle,
):
    """Stream a generation on the GPU, updating the last chat turn in place.

    Args:
        inputs: Chat messages in the form accepted by
            ``tokenizer.apply_chat_template``.
        history: Chat history as a list of ``(user, assistant)`` tuples; the
            last entry is overwritten as text arrives.
        temperature: Sampling temperature forwarded to the generator.
        top_p: Nucleus-sampling threshold forwarded to the generator.
        use_eagle: When truthy use ``model.ea_generate``; otherwise
            ``model.naive_generate``.

    Yields:
        ``("", history)`` once before generation starts and once after each
        batch of newly decoded tokens.
    """
    prompt_text = tokenizer.apply_chat_template(
        inputs,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt")

    logger.info("向 GPU 搬运 global_model")
    model = _initialize_gpu_model()

    # attention_mask is not forwarded (it was already disabled in this call);
    # the batch holds a single prompt, so no padding is involved.
    cuda_inputs = dict(
        input_ids=model_inputs.input_ids.to("cuda"),
        max_new_tokens=4096,
        temperature=temperature,
        top_p=top_p,
        top_k=50,
        max_length=65536,
    )

    model.base_model.to("cuda")
    model.ea_layer.to("cuda")
    # Tensor.to() is not in-place: it returns the moved tensor. The result
    # must be assigned back, otherwise the mask silently stays where it was.
    model.ea_layer.tree_mask_init = model.ea_layer.tree_mask_init.to("cuda")

    logger.info("pass inputs to global_model")

    # Emit once up front so the UI clears the textbox before tokens arrive.
    yield "", history

    stop_token_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    gen_tk_count = 0
    existing_tk_count = len(model_inputs.input_ids[0])

    stream_text = ""
    start_time = time.time()

    generate_func = model.ea_generate if use_eagle else model.naive_generate

    for output_ids in generate_func(**cuda_inputs):
        new_tokens, gen_tk_count = mtp_new_tokens(
            output_ids, gen_tk_count, existing_tk_count, stop_token_ids
        )
        new_token_text = tokenizer.decode(new_tokens, skip_special_tokens=False)
        logger.info("[MTP]'''%s'''", new_token_text)
        stream_text += new_token_text
        # Clamp the elapsed time so a coarse clock cannot cause a zero division.
        elapsed = max(time.time() - start_time, 1e-6)
        token_per_sec = gen_tk_count / elapsed
        display_text = stream2display_text(stream_text, token_per_sec)
        history[-1] = (history[-1][0], display_text)
        yield "", history

    # Replace the display-formatted reply with the plain generated text.
    # NOTE(review): nothing is yielded after this assignment, so the change is
    # only visible to code holding this same list object -- confirm whether a
    # final yield was intended.
    history[-1] = (history[-1][0], stream_text.replace("<|im_end|>", ""))


class Model:
    """Thin facade over the GPU handlers; it does not hold the model object."""

    def __init__(self):
        logger.info("创建封装类")

    def handler(self, inputs):
        """Non-streaming inference: return the full reply for ``inputs``."""
        reply = gpu_handler(inputs)
        return reply

    def stream_handler(self, inputs, history, **kwargs):
        """Streaming inference: relay every item produced by ``gpu_handler_s``."""
        stream = gpu_handler_s(inputs, history, **kwargs)
        yield from stream


def initialize_model():
    """Create the global ``Model`` wrapper and try to preload weights on CPU.

    Preloading is best-effort: on any failure a warning is logged and the
    cache is reset to ``None`` so the model gets loaded lazily later.

    Returns:
        The newly created global ``Model`` instance.
    """
    global global_model, _gpu_model_cache

    # Startup banner.
    separator = "=" * 50
    logger.info(separator)
    logger.info("启动 MiniCPM-4.1-8B-Eagle3 Chatbot 服务")
    logger.info(separator)

    # Create the wrapper object.
    global_model = Model()

    # Preload the model onto the CPU in the main process (for a faster first
    # inference).
    try:
        logger.info("在主进程中预加载模型到 CPU...")
        _gpu_model_cache = EaModel.from_pretrained(**model_config)
        logger.info("模型在主进程CPU上预加载完成")
    except Exception as err:
        logger.warning(f"主进程预加载模型失败, 将在GPU进程中加载: {err}")
        _gpu_model_cache = None

    return global_model


def gen_response(message, history, temperature, top_p):
    """Answer ``message`` in one shot and append the turn to ``history``.

    ``temperature`` and ``top_p`` are accepted but not used by this function.

    Returns:
        ``("", history)`` -- an empty string for the input box and the
        updated history.
    """
    messages = organize_messages(message, history)
    reply = global_model.handler(messages)

    turn = (message, reply)
    history.append(turn)
    return "", history

def gen_response_stream(
    message,
    history,
    temperature,
    top_p,
    use_eagle,
):
    """Stream the reply to ``message``, relaying what the stream handler yields.

    The chat messages are built from the history *before* the new turn is
    appended; the appended turn starts with an empty reply.
    """
    messages = organize_messages(message, history)

    # Placeholder turn for the reply that is about to be streamed.
    history.append((message, ""))

    yield from global_model.stream_handler(
        messages,
        history,
        temperature=temperature,
        top_p=top_p,
        use_eagle=use_eagle,
    )

def create_app():
    """Build the Gradio Blocks UI for the chatbot and return it (not launched).

    Layout: a narrow left column with the logo, sampling controls and a clear
    button, and a wide right column with the chat history and the input box.
    """
    # Register ./assets (relative to the working directory) as a static path.
    assets_path = Path.cwd().absolute() / "assets"
    gr.set_static_paths(paths=[assets_path])
    logger.info(f"Static resource path: {assets_path}. READY.")

    font_stack = [gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"]
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        neutral_hue="slate",
        font=font_stack,
    )

    page_css = """
        .logo-container {
            text-align: center;
            margin: 0.5rem 0 1rem 0;
        }
        .logo-container img {
            height: 96px;
            width: auto;
            max-width: 200px;
            display: inline-block;
        }
        .input-box {
            border: 1px solid #2f63b8;
            border-radius: 8px;
        }
        """

    with gr.Blocks(theme=theme, css=page_css) as demo:
        with gr.Row():
            # Left column: logo, sampling controls, clear button, credit.
            with gr.Column(scale=1):
                gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/OpenBMB-MiniCPM.png" alt="MiniCPM Logo"></div>')

                # Thin spacer.
                gr.HTML("<div style='height:1px;'></div>")

                temperature = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.9,
                    step=0.05,
                    label="Temperature",
                    scale=1,
                )
                top_p = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.95,
                    step=0.01,
                    label="Top-p",
                    scale=1,
                )
                use_eagle = gr.Checkbox(label="Speculative Decoding", value=True)

                # Tall spacer above the clear button.
                gr.HTML("<div style='height:128px;'></div>")

                clear = gr.Button("Clear History")

                gr.Markdown(
                    """
                    Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a>
                    """
                )
            # Right column: conversation and input box.
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(
                    label="Chat History",
                    placeholder="Input to start a new chat",
                    height=500,
                )
                prompt = gr.Textbox(
                    label="Input Text",
                    placeholder="Type your message here...",
                    lines=1,
                    elem_classes=["input-box"],  # custom class targeted by the CSS
                )

        # Event wiring: Enter streams a reply; the button empties the chat.
        prompt.submit(
            fn=gen_response_stream,
            inputs=[prompt, chatbot, temperature, top_p, use_eagle],
            outputs=[prompt, chatbot],
        )
        clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

    return demo


if __name__ == "__main__":
    # Initialize the model wrapper (and try to preload the weights on CPU).
    initialize_model()
    
    # Build the Gradio app and launch it.
    demo = create_app()
    demo.launch()