#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
docker build -t llm_eval_system:v20250724_1442 .

docker stop llm_eval_system_7862 && docker rm llm_eval_system_7862

docker run -itd \
--name llm_eval_system_7862 \
--restart=always \
--network host \
-e port=7862 \
llm_eval_system:v20250724_1442 \
/bin/bash

docker run -itd \
--name llm_eval_system_7862 \
--restart=always \
--network host \
-v /data/tianxing/PycharmProjects/llm_eval_system:/data/tianxing/PycharmProjects/llm_eval_system \
python:3.12 \
/bin/bash

nohup python3 main.py --server_port 7862 &
"""

import argparse
import json
import logging
from pathlib import Path
import platform
from typing import Tuple, List
import time

import gradio as gr
import numpy as np
import pandas as pd

from project_settings import environment, project_path, log_directory
from toolbox.os.command import Command
import log

from tabs.fs_tab import get_fs_tab
from tabs.shell_tab import get_shell_tab

log.setup_size_rotating(log_directory=log_directory)

logger = logging.getLogger("main")


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--server_port",
        default=7860,
        type=int,
    )
    args = parser.parse_args()
    return args
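

# Per-column width constraints for the gr.DataFrame components defined in main(),
# matched by their `elem_id` values ("dataset_df" and "view_chat_df").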
css = """
#dataset_df th:nth-child(1), #dataset_df td:nth-child(1) {
    max-width: 50px !important; /* column 1 */
}
#dataset_df th:nth-child(2), #dataset_df td:nth-child(2) {
    max-width: 500px !important; /* column 2 */
}
#dataset_df th:nth-child(3), #dataset_df td:nth-child(3) {
    max-width: 50px !important; /* column 3 */
}

#view_chat_df th:nth-child(1), #view_chat_df td:nth-child(1) {
    max-width: 50px !important; /* column 1 */
}
#view_chat_df th:nth-child(2), #view_chat_df td:nth-child(2) {
    max-width: 500px !important; /* column 2 */
}
#view_chat_df th:nth-child(3), #view_chat_df td:nth-child(3) {
    max-width: 400px !important; /* column 3 */
}
#view_chat_df th:nth-child(4), #view_chat_df td:nth-child(4) {
    max-width: 400px !important; /* column 4 */
}
#view_chat_df th:nth-child(5), #view_chat_df td:nth-child(5) {
    max-width: 400px !important; /* column 5 */
}
#view_chat_df th:nth-child(6), #view_chat_df td:nth-child(6) {
    max-width: 80px !important; /* column 6 */
}
"""

eval_data_dir: Path = None
llm_ranking: pd.DataFrame = None
last_update_ts: float = 0
update_interval = 1 * 60 * 60
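

# load_board() walks eval_data_dir for "*.jsonl" result files and derives the
# leaderboard fields from the path components, i.e. it expects a layout like:
#   <script>/<company>/<model_name>/<client>/<service>/<date>/<dataset>.jsonl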
def load_board():
    result = list()
    for filename in eval_data_dir.glob("**/*.jsonl"):
        name = filename.stem
        dataset = filename.parts[-1]
        date = filename.parts[-2]
        service = filename.parts[-3]
        client = filename.parts[-4]
        model_name = filename.parts[-5]
        company = filename.parts[-6]
        script = filename.parts[-7]

        if date.endswith("-delete"):
            continue
        # if name.endswith("-chat"):
        #     continue

        score_list = list()
        time_cost_list = list()
        total = 0
        with open(filename.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                try:
                    row = json.loads(row)
                except Exception as e:
                    print(f"json load row failed. error type: {type(e)}, error text: {str(e)}")
                    logger.error(f"json load row failed. error type: {type(e)}, error text: {str(e)}")
                    raise e

                if name.endswith("-choice"):
                    score_ = row["correct"]
                elif name.endswith("-chat"):
                    score_ = row["score"]
                elif name.endswith("-summary"):
                    score_ = row["score"]
                else:
                    raise AssertionError

                time_cost_ = row["time_cost"]

                score_list.append(score_)
                time_cost_list.append(time_cost_)
                total += 1

        if total == 0:
            continue

        score = np.mean(score_list)
        time_cost_mean = np.mean(time_cost_list)
        time_cost_var = np.var(time_cost_list)
        time_cost_p75 = np.percentile(time_cost_list, 75)
        time_cost_p95 = np.percentile(time_cost_list, 95)
        time_cost_p99 = np.percentile(time_cost_list, 99)

        row_ = {
            "company": company,
            "model_name": model_name,
            "dataset": dataset,
            "score": round(score, 4),
            "time_cost(mean)": round(time_cost_mean, 4),
            "time_cost(var)": round(time_cost_var, 4),
            "time_cost(75%)": round(time_cost_p75, 4),
            "time_cost(95%)": round(time_cost_p95, 4),
            "time_cost(99%)": round(time_cost_p99, 4),
            "service": service,
            "client": client,
            "script": f"{script}.py",
            "version": date,
            "count": total,
        }
        result.append(row_)

    result = pd.DataFrame(result)
    return result
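

# load_board_lazy() caches the leaderboard DataFrame in the module-level
# `llm_ranking` and only rebuilds it when the previous build is older than
# `update_interval` seconds (one hour).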
def load_board_lazy():
    global llm_ranking
    global last_update_ts

    now = time.time()
    if now - last_update_ts > update_interval:
        llm_ranking = load_board()
        last_update_ts = now
    return llm_ranking


def when_click_board_button(columns: List[str]):
    result = load_board_lazy()
    try:
        result = result[columns]
    except KeyError as e:
        raise gr.Error(f"{str(e)}, columns: {list(result.columns)}")
    return result


def when_click_view_dataset_button(filename: str):
    filename = (project_path / filename).as_posix()

    result = list()
    with open(filename, "r", encoding="utf-8") as f:
        for row in f:
            row = json.loads(row)
            result.append(row)
    result = pd.DataFrame(result)
    return result
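

# when_click_view_chat_button() loads a "*-chat.jsonl" evaluation file; the
# displayed "conversation" is the last blank-line-separated block of the prompt,
# and the "evaluate" field is pretty-printed as JSON for readability.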
def when_click_view_chat_button(filename: str):
    filename = (project_path / filename).as_posix()

    result = list()
    with open(filename, "r", encoding="utf-8") as f:
        for row in f:
            row = json.loads(row)
            idx = row["idx"]
            prompt: str = row["prompt"]
            conversation = prompt.split("\n\n")[-1].strip()
            response = row["response"]
            prediction = row["prediction"]
            evaluate = row["evaluate"]
            score = row["score"]
            row_ = {
                "idx": idx,
                "conversation": conversation,
                "response": response,
                "prediction": prediction,
                "evaluate": json.dumps(evaluate, ensure_ascii=False, indent=4),
                "score": score,
            }
            result.append(row_)
    result = pd.DataFrame(result)
    return result


board_columns_choices = [
    "company", "model_name", "dataset", "score",
    "time_cost(mean)",
    "time_cost(var)",
    "time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
    "service", "client",
    "script", "version", "count",
]

board_columns_choices_default_value = [
    "company", "model_name", "dataset", "score",
    "time_cost(mean)",
    "time_cost(var)",
    # "time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
]

dataset_examples_list = [
    [
        "arc-easy-1000-choice.jsonl",
        "ARC (AI2 Reasoning Challenge)\nThe AI2 Reasoning Challenge (ARC) dataset is a multiple-choice question-answering dataset of science exam questions from grades 3 through 9.\nThe dataset is split into two parts: an Easy set and a Challenge set.\n\nThe first 1000 items of the Easy set are taken as arc-easy-1000-choice.jsonl",
        "data/dataset/arc-easy-1000-choice.jsonl"
    ],
    [
        "agent-lingoace-zh-400-choice.jsonl",
        "LingoAce dataset.",
        "data/dataset/agent-lingoace-zh-400-choice.jsonl"
    ],
]
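

# main() builds the Gradio UI: a "board" tab for the leaderboard, a "view_chat"
# tab for inspecting per-sample chat evaluations, a "dataset" tab for browsing
# the raw datasets, plus the fs and shell tabs provided by tabs/.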
def main():
    args = get_args()

    global eval_data_dir
    global llm_ranking
    eval_data_dir = Path(args.eval_data_dir)

    llm_ranking_board = when_click_board_button(board_columns_choices_default_value)

    # chat
    chat_eval_data_examples = list()
    for filename in eval_data_dir.glob("**/*-chat.jsonl"):
        dataset = filename.parts[-1]
        model_name = filename.parts[-5]
        company = filename.parts[-6]
        chat_eval_data_examples.append([
            company, model_name, dataset, filename.as_posix()
        ])

    # ui
    with gr.Blocks(css=css) as blocks:
        with gr.Tabs():
            with gr.TabItem("board"):
                board_columns = gr.CheckboxGroup(
                    choices=board_columns_choices,
                    value=board_columns_choices_default_value,
                    label="columns"
                )
                board_button = gr.Button(value="View", variant="primary", visible=True)
                board_board = gr.DataFrame(
                    value=llm_ranking_board,
                    max_height=800, min_width=160,
                    label="board",
                    # interactive=True,
                    show_search="search"
                )
                board_button.click(
                    fn=when_click_board_button,
                    inputs=[board_columns],
                    outputs=[board_board],
                )
            with gr.TabItem("view_chat"):
                view_chat_company = gr.Textbox(label="company", visible=False)
                view_chat_model_name = gr.Textbox(label="model_name", visible=False)
                view_chat_dataset = gr.Textbox(label="dataset", visible=False)
                view_chat_filename = gr.Textbox(label="filename", visible=True)
                gr.Examples(
                    examples=chat_eval_data_examples,
                    inputs=[view_chat_company, view_chat_model_name, view_chat_dataset, view_chat_filename],
                    outputs=None,
                )
                with gr.Row():
                    view_chat_button = gr.Button(value="View", variant="primary", visible=True)
                view_chat_df = gr.DataFrame(
                    value=None,
                    max_height=1000, min_width=160,
                    label="dataset", interactive=True,
                    show_search="search",
                    elem_id="view_chat_df"
                )
                view_chat_button.click(
                    fn=when_click_view_chat_button,
                    inputs=[view_chat_filename],
                    outputs=[view_chat_df],
                )
            with gr.TabItem("dataset"):
                dataset_name = gr.Textbox(label="name")
                dataset_desc = gr.Textbox(label="desc")
                dataset_filename = gr.Textbox(label="filename")
                gr.Examples(
                    examples=dataset_examples_list,
                    inputs=[dataset_name, dataset_desc, dataset_filename],
                    outputs=None,
                )
                dataset_button = gr.Button(value="View", variant="primary", visible=True)
                dataset_df = gr.DataFrame(
                    value=None, label="dataset", interactive=True,
                    show_search="search",
                    elem_id="dataset_df"
                )
                dataset_button.click(
                    fn=when_click_view_dataset_button,
                    inputs=[dataset_filename],
                    outputs=[dataset_df],
                )
            _ = get_fs_tab()
            _ = get_shell_tab()

    # http://127.0.0.1:7861/
    # http://10.75.27.247:7861/
    blocks.queue().launch(
        share=False,
        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        # server_name="0.0.0.0",
        server_port=environment.get("port", default=args.server_port, dtype=int),
    )
    return


if __name__ == "__main__":
    main()