Spaces:
Sleeping
Sleeping
Commit
·
4464055
0
Parent(s):
first commit
Browse files- .gitattributes +37 -0
- .gitignore +16 -0
- Dockerfile +24 -0
- README.md +16 -0
- examples/make_dataset/make_arc_easy_1000_choice.py +85 -0
- examples/make_dataset/make_lingoace_400_choice.py +70 -0
- examples/make_dataset/make_lingoace_80_chat.py +222 -0
- examples/test_metrics/lingoace_chat_metric.py +387 -0
- examples/test_metrics/lingoace_chat_metric_test.py +482 -0
- llm_eval_script/aws_claude.py +222 -0
- llm_eval_script/aws_claude_chat.py +213 -0
- llm_eval_script/azure_openai.py +192 -0
- llm_eval_script/azure_openai_chat.py +183 -0
- llm_eval_script/byteplus.py +153 -0
- llm_eval_script/gemini_google.py +176 -0
- llm_eval_script/gemini_google_chat.py +167 -0
- llm_eval_script/ollama.py +101 -0
- llm_eval_script/siliconflow.py +236 -0
- llm_eval_script/siliconflow_chat.py +225 -0
- log.py +222 -0
- main.py +75 -0
- project_settings.py +22 -0
- requirements.txt +4 -0
- tabs/__init__.py +6 -0
- tabs/fs_tab.py +67 -0
- tabs/shell_tab.py +28 -0
- toolbox/__init__.py +6 -0
- toolbox/os/__init__.py +6 -0
- toolbox/os/command.py +59 -0
- toolbox/os/environment.py +48 -0
- toolbox/os/other.py +9 -0
.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
.git/
|
| 3 |
+
.idea/
|
| 4 |
+
|
| 5 |
+
/data/
|
| 6 |
+
#/data/eval_data
|
| 7 |
+
/data/raw_dataset
|
| 8 |
+
/dotenv/
|
| 9 |
+
/logs/
|
| 10 |
+
/trained_models
|
| 11 |
+
/temp/
|
| 12 |
+
|
| 13 |
+
**/__pycache__/
|
| 14 |
+
|
| 15 |
+
#**/*.wav
|
| 16 |
+
**/*.xlsx
|
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12
|
| 2 |
+
|
| 3 |
+
WORKDIR /code
|
| 4 |
+
|
| 5 |
+
COPY . /code
|
| 6 |
+
|
| 7 |
+
RUN apt-get update
|
| 8 |
+
RUN apt-get install -y wget unzip ffmpeg build-essential git
|
| 9 |
+
|
| 10 |
+
RUN pip install --upgrade pip
|
| 11 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
| 12 |
+
|
| 13 |
+
RUN useradd -m -u 1000 user
|
| 14 |
+
|
| 15 |
+
USER user
|
| 16 |
+
|
| 17 |
+
ENV HOME=/home/user \
|
| 18 |
+
PATH=/home/user/.local/bin:$PATH
|
| 19 |
+
|
| 20 |
+
WORKDIR $HOME/app
|
| 21 |
+
|
| 22 |
+
COPY --chown=user . $HOME/app
|
| 23 |
+
|
| 24 |
+
CMD ["python3", "main.py"]
|
README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: LLM Eval System
|
| 3 |
+
emoji: 🐨
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 13 |
+
|
| 14 |
+
大模型评估系统
|
| 15 |
+
https://www.tapd.cn/tapd_fe/37829361/story/detail/1137829361001012418
|
| 16 |
+
|
examples/make_dataset/make_arc_easy_1000_choice.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
| 12 |
+
|
| 13 |
+
from project_settings import environment, project_path
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_args():
|
| 17 |
+
parser = argparse.ArgumentParser()
|
| 18 |
+
parser.add_argument(
|
| 19 |
+
"--raw_dataset",
|
| 20 |
+
default=(project_path / "data/raw_dataset/arc-easy-1000-choice.jsonl").as_posix(),
|
| 21 |
+
type=str
|
| 22 |
+
)
|
| 23 |
+
parser.add_argument(
|
| 24 |
+
"--dataset",
|
| 25 |
+
default=(project_path / "data/dataset/arc-easy-1000-choice.jsonl").as_posix(),
|
| 26 |
+
type=str
|
| 27 |
+
)
|
| 28 |
+
args = parser.parse_args()
|
| 29 |
+
return args
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main():
|
| 33 |
+
args = get_args()
|
| 34 |
+
|
| 35 |
+
dataset = Path(args.dataset)
|
| 36 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
| 37 |
+
|
| 38 |
+
with open(args.raw_dataset, "r", encoding="utf-8") as fin, open(args.dataset, "w", encoding="utf-8") as fout:
|
| 39 |
+
for row in fin:
|
| 40 |
+
row = json.loads(row)
|
| 41 |
+
idx = row["id"]
|
| 42 |
+
question = row["question"]
|
| 43 |
+
choices = row["choices"]
|
| 44 |
+
answer_key = row["answerkey"]
|
| 45 |
+
|
| 46 |
+
instruct = "Complete this single-choice question."
|
| 47 |
+
|
| 48 |
+
choices_str = ""
|
| 49 |
+
for choice in choices:
|
| 50 |
+
label = choice["label"]
|
| 51 |
+
text = choice["text"]
|
| 52 |
+
choices_str += f"If you think the answer is `{text}` output: `{label}`\n"
|
| 53 |
+
choices_str = choices_str.strip()
|
| 54 |
+
|
| 55 |
+
prompt = f"""
|
| 56 |
+
{instruct}
|
| 57 |
+
|
| 58 |
+
Question:
|
| 59 |
+
{question}
|
| 60 |
+
|
| 61 |
+
Choices:
|
| 62 |
+
{choices_str}
|
| 63 |
+
|
| 64 |
+
Remember to output ONLY the corresponding letter.
|
| 65 |
+
Your output is:
|
| 66 |
+
""".strip()
|
| 67 |
+
|
| 68 |
+
response = answer_key
|
| 69 |
+
|
| 70 |
+
print(f"{prompt}\n{response}")
|
| 71 |
+
print("-" * 150)
|
| 72 |
+
|
| 73 |
+
row_ = {
|
| 74 |
+
"idx": idx,
|
| 75 |
+
"prompt": prompt,
|
| 76 |
+
"response": response,
|
| 77 |
+
}
|
| 78 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 79 |
+
fout.write(f"{row_}\n")
|
| 80 |
+
|
| 81 |
+
return
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
main()
|
examples/make_dataset/make_lingoace_400_choice.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
| 12 |
+
|
| 13 |
+
from project_settings import environment, project_path
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_args():
|
| 17 |
+
parser = argparse.ArgumentParser()
|
| 18 |
+
parser.add_argument(
|
| 19 |
+
"--raw_dataset",
|
| 20 |
+
default=(project_path / "data/raw_dataset/agent-lingoace-zh-400-choice").as_posix(),
|
| 21 |
+
type=str
|
| 22 |
+
)
|
| 23 |
+
parser.add_argument(
|
| 24 |
+
"--dataset",
|
| 25 |
+
default=(project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl").as_posix(),
|
| 26 |
+
type=str
|
| 27 |
+
)
|
| 28 |
+
args = parser.parse_args()
|
| 29 |
+
return args
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main():
|
| 33 |
+
args = get_args()
|
| 34 |
+
|
| 35 |
+
raw_dataset = Path(args.raw_dataset)
|
| 36 |
+
dataset = Path(args.dataset)
|
| 37 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
| 38 |
+
|
| 39 |
+
with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
|
| 40 |
+
for sample_dir in raw_dataset.glob("*"):
|
| 41 |
+
idx = sample_dir.parts[-1]
|
| 42 |
+
system_prompt_file = sample_dir / "system_prompt.txt"
|
| 43 |
+
user_prompt_file = sample_dir / "user_prompt.txt"
|
| 44 |
+
response_file = sample_dir / "response.txt"
|
| 45 |
+
|
| 46 |
+
with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 47 |
+
system_prompt = f.read()
|
| 48 |
+
with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 49 |
+
user_prompt = f.read()
|
| 50 |
+
with open(response_file.as_posix(), "r", encoding="utf-8") as f:
|
| 51 |
+
response = f.read()
|
| 52 |
+
|
| 53 |
+
prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
|
| 54 |
+
|
| 55 |
+
print(f"{prompt}\n\n{response}")
|
| 56 |
+
print("-" * 150)
|
| 57 |
+
|
| 58 |
+
row_ = {
|
| 59 |
+
"idx": idx,
|
| 60 |
+
"prompt": prompt,
|
| 61 |
+
"response": response,
|
| 62 |
+
}
|
| 63 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 64 |
+
fout.write(f"{row_}\n")
|
| 65 |
+
|
| 66 |
+
return
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
main()
|
examples/make_dataset/make_lingoace_80_chat.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
| 12 |
+
|
| 13 |
+
from project_settings import environment, project_path
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_args():
|
| 17 |
+
parser = argparse.ArgumentParser()
|
| 18 |
+
parser.add_argument(
|
| 19 |
+
"--raw_dataset",
|
| 20 |
+
default=(project_path / "data/raw_dataset/agent-lingoace-zh-80-chat").as_posix(),
|
| 21 |
+
type=str
|
| 22 |
+
)
|
| 23 |
+
parser.add_argument(
|
| 24 |
+
"--dataset",
|
| 25 |
+
default=(project_path / "data/dataset/agent-lingoace-zh-80-chat.jsonl").as_posix(),
|
| 26 |
+
type=str
|
| 27 |
+
)
|
| 28 |
+
args = parser.parse_args()
|
| 29 |
+
return args
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
"""
|
| 33 |
+
<人设>
|
| 34 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
| 35 |
+
|
| 36 |
+
<工作>
|
| 37 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
| 38 |
+
|
| 39 |
+
<工作流程>
|
| 40 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
| 41 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
| 42 |
+
|
| 43 |
+
<常见用户问题的标准回答>
|
| 44 |
+
1.
|
| 45 |
+
Q: 号码是哪里来的
|
| 46 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
| 47 |
+
2.
|
| 48 |
+
Q: 询问价格
|
| 49 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
| 50 |
+
3.
|
| 51 |
+
Q: 公司/上课地点
|
| 52 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
| 53 |
+
4.
|
| 54 |
+
Q: 是否机器人
|
| 55 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
| 56 |
+
5.
|
| 57 |
+
Q: 考虑商量
|
| 58 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 59 |
+
6.
|
| 60 |
+
Q:客户问AI是否听得到
|
| 61 |
+
A:在听的,您接着说;
|
| 62 |
+
7.
|
| 63 |
+
Q:赞美
|
| 64 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
| 65 |
+
8.
|
| 66 |
+
Q:听不清楚AI说话
|
| 67 |
+
A:现在您能听清楚吗?;
|
| 68 |
+
9.
|
| 69 |
+
Q:课程教材
|
| 70 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
| 71 |
+
10.
|
| 72 |
+
Q:怎么称呼
|
| 73 |
+
A:您好,您叫我静静老师就可以。;
|
| 74 |
+
11.
|
| 75 |
+
Q:干什么的
|
| 76 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
| 77 |
+
12.
|
| 78 |
+
Q:你是骗子吗
|
| 79 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
| 80 |
+
13.
|
| 81 |
+
Q:平台名称
|
| 82 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
| 83 |
+
14.
|
| 84 |
+
Q:联系电话
|
| 85 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
| 86 |
+
15.
|
| 87 |
+
Q:询问线下
|
| 88 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么问题可以咨询我们专属顾问哈;
|
| 89 |
+
16.
|
| 90 |
+
Q:调戏
|
| 91 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
| 92 |
+
17.
|
| 93 |
+
Q:下载软件/APP
|
| 94 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
| 95 |
+
18.
|
| 96 |
+
Q:老师资质
|
| 97 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
| 98 |
+
19.
|
| 99 |
+
Q:优惠活动
|
| 100 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
| 101 |
+
20.
|
| 102 |
+
Q:课程内容
|
| 103 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
| 104 |
+
21.
|
| 105 |
+
Q:考虑商量
|
| 106 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 107 |
+
22.
|
| 108 |
+
Q:已经报班/已经上课/还有课程包
|
| 109 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
| 110 |
+
23.
|
| 111 |
+
Q:适合多大孩子学习
|
| 112 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
| 113 |
+
24.
|
| 114 |
+
Q:一节课时长
|
| 115 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
| 116 |
+
25.
|
| 117 |
+
Q:不在某地
|
| 118 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
| 119 |
+
26.
|
| 120 |
+
Q:优势及区别
|
| 121 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
| 122 |
+
27.
|
| 123 |
+
Q:有没有其他课程
|
| 124 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
| 125 |
+
28.
|
| 126 |
+
Q:家长旁听
|
| 127 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
| 128 |
+
29.
|
| 129 |
+
Q:正式报名详情
|
| 130 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
| 131 |
+
30.
|
| 132 |
+
Q:试听之后怎么收费
|
| 133 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
| 134 |
+
31.
|
| 135 |
+
Q:判断孩子基础
|
| 136 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
| 137 |
+
32.
|
| 138 |
+
Q:活动时间限制
|
| 139 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
| 140 |
+
33.
|
| 141 |
+
Q:直播还是录播
|
| 142 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
| 143 |
+
34.
|
| 144 |
+
Q:体验过了
|
| 145 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
| 146 |
+
35.
|
| 147 |
+
Q:线上不考虑
|
| 148 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
| 149 |
+
36.
|
| 150 |
+
Q:上课方式
|
| 151 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
| 152 |
+
37.
|
| 153 |
+
Q:固定老师
|
| 154 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
| 155 |
+
38.
|
| 156 |
+
Q:课程是几节课
|
| 157 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
| 158 |
+
39.
|
| 159 |
+
Q:老师是兼职的还是全职的
|
| 160 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
| 161 |
+
40.
|
| 162 |
+
Q:报了线下班
|
| 163 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
| 164 |
+
41.
|
| 165 |
+
Q:孩子不想学
|
| 166 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
| 167 |
+
|
| 168 |
+
案例:
|
| 169 |
+
[用户]: "hi"
|
| 170 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
| 171 |
+
[用户]: "你是机器人吗?"
|
| 172 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
| 173 |
+
|
| 174 |
+
<限制>
|
| 175 |
+
- 保持友好、专业和机智。
|
| 176 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
| 177 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
| 178 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
| 179 |
+
- 不要输出 “好的”,可以使用其他同义词
|
| 180 |
+
- 不要反复说“他/她”,只需要表达一次。
|
| 181 |
+
"""
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def main():
|
| 185 |
+
args = get_args()
|
| 186 |
+
|
| 187 |
+
raw_dataset = Path(args.raw_dataset)
|
| 188 |
+
dataset = Path(args.dataset)
|
| 189 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
| 190 |
+
|
| 191 |
+
with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
|
| 192 |
+
for sample_dir in raw_dataset.glob("*"):
|
| 193 |
+
idx = sample_dir.parts[-1]
|
| 194 |
+
system_prompt_file = sample_dir / "system_prompt.txt"
|
| 195 |
+
user_prompt_file = sample_dir / "user_prompt.txt"
|
| 196 |
+
response_file = sample_dir / "response.txt"
|
| 197 |
+
|
| 198 |
+
with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 199 |
+
system_prompt = f.read()
|
| 200 |
+
with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 201 |
+
user_prompt = f.read()
|
| 202 |
+
with open(response_file.as_posix(), "r", encoding="utf-8") as f:
|
| 203 |
+
response = f.read()
|
| 204 |
+
|
| 205 |
+
prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
|
| 206 |
+
|
| 207 |
+
print(f"{prompt}\n\n{response}")
|
| 208 |
+
print("-" * 150)
|
| 209 |
+
|
| 210 |
+
row_ = {
|
| 211 |
+
"idx": idx,
|
| 212 |
+
"prompt": prompt,
|
| 213 |
+
"response": response,
|
| 214 |
+
}
|
| 215 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 216 |
+
fout.write(f"{row_}\n")
|
| 217 |
+
|
| 218 |
+
return
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
if __name__ == "__main__":
|
| 222 |
+
main()
|
examples/test_metrics/lingoace_chat_metric.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import sys
|
| 10 |
+
import time
|
| 11 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 12 |
+
|
| 13 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 14 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 15 |
+
|
| 16 |
+
import openai
|
| 17 |
+
from openai import AzureOpenAI
|
| 18 |
+
|
| 19 |
+
from project_settings import environment, project_path
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_args():
|
| 23 |
+
"""
|
| 24 |
+
python3 azure_openai.py --model_name gpt-4o-mini \
|
| 25 |
+
--eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
|
| 26 |
+
--client "us_west(47.88.76.239)" \
|
| 27 |
+
--create_time_str 20250723_095001 \
|
| 28 |
+
--interval 10
|
| 29 |
+
|
| 30 |
+
python3 azure_openai.py --model_name gpt-4o-mini \
|
| 31 |
+
--eval_dataset_name arc-easy-1000-choice.jsonl \
|
| 32 |
+
--client "us_west(47.88.76.239)" \
|
| 33 |
+
--create_time_str 20250723_111000 \
|
| 34 |
+
--interval 10
|
| 35 |
+
|
| 36 |
+
"""
|
| 37 |
+
parser = argparse.ArgumentParser()
|
| 38 |
+
parser.add_argument(
|
| 39 |
+
"--model_name",
|
| 40 |
+
default="gpt-4o",
|
| 41 |
+
# default="gpt-4o-mini",
|
| 42 |
+
type=str
|
| 43 |
+
)
|
| 44 |
+
parser.add_argument(
|
| 45 |
+
"--eval_data_file",
|
| 46 |
+
default=(project_path / "data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-R1-Distill-Qwen-7B/shenzhen_sase/siliconflow_api_key/20250724_135543/agent-lingoace-zh-80-chat.jsonl.raw").as_posix(),
|
| 47 |
+
type=str
|
| 48 |
+
)
|
| 49 |
+
parser.add_argument(
|
| 50 |
+
"--output_file",
|
| 51 |
+
default=(project_path / "data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-R1-Distill-Qwen-7B/shenzhen_sase/siliconflow_api_key/20250724_135543/agent-lingoace-zh-80-chat.jsonl").as_posix(),
|
| 52 |
+
type=str
|
| 53 |
+
)
|
| 54 |
+
parser.add_argument(
|
| 55 |
+
"--service",
|
| 56 |
+
default="west_us_chatgpt_openai_azure_com",
|
| 57 |
+
type=str
|
| 58 |
+
)
|
| 59 |
+
args = parser.parse_args()
|
| 60 |
+
return args
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
task_prompt = """
|
| 64 |
+
<人设>
|
| 65 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
| 66 |
+
|
| 67 |
+
<工作>
|
| 68 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
| 69 |
+
|
| 70 |
+
<工作流程>
|
| 71 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
| 72 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
| 73 |
+
|
| 74 |
+
<常见用户问题的标准回答>
|
| 75 |
+
1.
|
| 76 |
+
Q: 号码是哪里来的
|
| 77 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
| 78 |
+
2.
|
| 79 |
+
Q: 询问价格
|
| 80 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
| 81 |
+
3.
|
| 82 |
+
Q: 公司/上课地点
|
| 83 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
| 84 |
+
4.
|
| 85 |
+
Q: 是否机器人
|
| 86 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
| 87 |
+
5.
|
| 88 |
+
Q: 考虑商量
|
| 89 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 90 |
+
6.
|
| 91 |
+
Q:客户问AI是否听得到
|
| 92 |
+
A:在听的,您接着说;
|
| 93 |
+
7.
|
| 94 |
+
Q:赞美
|
| 95 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
| 96 |
+
8.
|
| 97 |
+
Q:听不清楚AI说话
|
| 98 |
+
A:现在您能听清楚吗?;
|
| 99 |
+
9.
|
| 100 |
+
Q:课程教材
|
| 101 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
| 102 |
+
10.
|
| 103 |
+
Q:怎么称呼
|
| 104 |
+
A:您好,您叫我静静老师就可以。;
|
| 105 |
+
11.
|
| 106 |
+
Q:干什么的
|
| 107 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
| 108 |
+
12.
|
| 109 |
+
Q:你是骗子吗
|
| 110 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
| 111 |
+
13.
|
| 112 |
+
Q:平台名称
|
| 113 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
| 114 |
+
14.
|
| 115 |
+
Q:联系电话
|
| 116 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
| 117 |
+
15.
|
| 118 |
+
Q:询问线下
|
| 119 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么���题可以咨询我们专属顾问哈;
|
| 120 |
+
16.
|
| 121 |
+
Q:调戏
|
| 122 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
| 123 |
+
17.
|
| 124 |
+
Q:下载软件/APP
|
| 125 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
| 126 |
+
18.
|
| 127 |
+
Q:老师资质
|
| 128 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
| 129 |
+
19.
|
| 130 |
+
Q:优惠活动
|
| 131 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
| 132 |
+
20.
|
| 133 |
+
Q:课程内容
|
| 134 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
| 135 |
+
21.
|
| 136 |
+
Q:考虑商量
|
| 137 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 138 |
+
22.
|
| 139 |
+
Q:已经报班/已经上课/还有课程包
|
| 140 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
| 141 |
+
23.
|
| 142 |
+
Q:适合多大孩子学习
|
| 143 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
| 144 |
+
24.
|
| 145 |
+
Q:一节课时长
|
| 146 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
| 147 |
+
25.
|
| 148 |
+
Q:不在某地
|
| 149 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
| 150 |
+
26.
|
| 151 |
+
Q:优势及区别
|
| 152 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
| 153 |
+
27.
|
| 154 |
+
Q:有没有其他课程
|
| 155 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
| 156 |
+
28.
|
| 157 |
+
Q:家长旁听
|
| 158 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
| 159 |
+
29.
|
| 160 |
+
Q:正式报名详情
|
| 161 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
| 162 |
+
30.
|
| 163 |
+
Q:试听之后怎么收费
|
| 164 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
| 165 |
+
31.
|
| 166 |
+
Q:判断孩子基础
|
| 167 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
| 168 |
+
32.
|
| 169 |
+
Q:活动时间限制
|
| 170 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
| 171 |
+
33.
|
| 172 |
+
Q:直播还是录播
|
| 173 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
| 174 |
+
34.
|
| 175 |
+
Q:体验过了
|
| 176 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
| 177 |
+
35.
|
| 178 |
+
Q:线上不考虑
|
| 179 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
| 180 |
+
36.
|
| 181 |
+
Q:上课方式
|
| 182 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
| 183 |
+
37.
|
| 184 |
+
Q:固定老师
|
| 185 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
| 186 |
+
38.
|
| 187 |
+
Q:课程是几节课
|
| 188 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
| 189 |
+
39.
|
| 190 |
+
Q:老师是兼职的还是全职的
|
| 191 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
| 192 |
+
40.
|
| 193 |
+
Q:报了线下班
|
| 194 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
| 195 |
+
41.
|
| 196 |
+
Q:孩子不想学
|
| 197 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
| 198 |
+
|
| 199 |
+
案例:
|
| 200 |
+
[用户]: "hi"
|
| 201 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
| 202 |
+
[用户]: "你是机器人吗?"
|
| 203 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
| 204 |
+
|
| 205 |
+
<限制>
|
| 206 |
+
- 保持友好、专业和机智。
|
| 207 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
| 208 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
| 209 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
| 210 |
+
- 不要输出 “好的”,可以使用其他同义词
|
| 211 |
+
- 不要反复说“他/她”,只需要表达一次。
|
| 212 |
+
|
| 213 |
+
<任务>
|
| 214 |
+
以上是对约课老师的工作描述。
|
| 215 |
+
当前我们已经获得了约课老师与家长的对话记录,约课老师实际的回复。
|
| 216 |
+
现在需要你为约课老师的答复进行评分。
|
| 217 |
+
|
| 218 |
+
评分要点:
|
| 219 |
+
(1)任务相关:句子是否与对话任务相关。总分2分。
|
| 220 |
+
(2)回复相关:句子是否与家长的回复相关。总分2分。
|
| 221 |
+
(3)邀约力度:与参考句子进行对比,句子是否具有足够的推销,邀约力度。总分1分。
|
| 222 |
+
|
| 223 |
+
<举例>
|
| 224 |
+
## 当前对话:
|
| 225 |
+
client: 你好
|
| 226 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
| 227 |
+
client:
|
| 228 |
+
Assistant: 我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 229 |
+
client: 不记得。
|
| 230 |
+
Assistant: 没关系的,您可以放心哈,因为您之前留过电话,所以我们才能联系到您。而且这次特地帮您争取了免费名额,您看要参加一下吗?
|
| 231 |
+
client: 您是没听清您是谁。
|
| 232 |
+
client: 什么?我没听清你前面说的。
|
| 233 |
+
Assistant:
|
| 234 |
+
|
| 235 |
+
## 约课老师实际的句子:
|
| 236 |
+
您好,现在您能听清楚吗?我是LingoAce的约课老师静静。我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 237 |
+
|
| 238 |
+
## 评分:
|
| 239 |
+
{
|
| 240 |
+
"任务相关": 2,
|
| 241 |
+
"任务相关 - 解释": "约课老师的回答与任务相关。",
|
| 242 |
+
"回复相关": 2,
|
| 243 |
+
"回复相关 - 解释": "约课老师的回答正确的回应了家长的 `没听清` 与 `您是谁` 并进行了邀约任务。回应的很好。",
|
| 244 |
+
"邀约力度": 1,
|
| 245 |
+
"邀约力度 - 解释": "邀约力度符合话术要求。"
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
## 当前对话:
|
| 249 |
+
client: 你好
|
| 250 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
| 251 |
+
client:
|
| 252 |
+
client: 啊,我们不需要的,谢谢啊。
|
| 253 |
+
Assistant: 我们这次联合
|
| 254 |
+
client: 关了。
|
| 255 |
+
Assistant:
|
| 256 |
+
|
| 257 |
+
## 约课老师实际的句子:
|
| 258 |
+
资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?
|
| 259 |
+
|
| 260 |
+
## 评分:
|
| 261 |
+
{
|
| 262 |
+
"任务相关": 2,
|
| 263 |
+
"任务相关 - 解释": "约课老师的回答与任务相关,继续介绍了课程的免费试听机会。",
|
| 264 |
+
"回复相关": 2,
|
| 265 |
+
"回复相关 - 解释": "家长说 `关了。` 可能是ASR误识别或噪音,约课老师没有理会,继续进行邀约话术。做得很好。",
|
| 266 |
+
"邀约力度": 0.7,
|
| 267 |
+
"邀约力度 - 解释": "约课老师进行了邀约,但句子较为冗长,家长可能不耐烦。"
|
| 268 |
+
}
|
| 269 |
+
""".strip()
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
task_prompt_2 = """
|
| 273 |
+
<请处理下面这个任务>
|
| 274 |
+
## 当前对话:
|
| 275 |
+
{conversation}
|
| 276 |
+
|
| 277 |
+
## 约课老师实际的句子:
|
| 278 |
+
{response}
|
| 279 |
+
|
| 280 |
+
## 评分:
|
| 281 |
+
""".strip()
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def main():
|
| 285 |
+
args = get_args()
|
| 286 |
+
|
| 287 |
+
service_params = environment.get(args.service, dtype=json.loads)
|
| 288 |
+
client = AzureOpenAI(
|
| 289 |
+
**service_params,
|
| 290 |
+
# api_key="Dqt75blRABmhgrwhfcupd1rq44YqNuEgku8FcFFDrEljMq6gltf0JQQJ99BCACYeBjFXJ3w3AAABACOG2njW",
|
| 291 |
+
# api_version="2025-01-01-preview",
|
| 292 |
+
# azure_endpoint="https://west-us-chatgpt.openai.azure.com"
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
total = 0
|
| 296 |
+
total_score = 0
|
| 297 |
+
|
| 298 |
+
# finished
|
| 299 |
+
finished_idx_set = set()
|
| 300 |
+
if os.path.exists(args.output_file):
|
| 301 |
+
with open(args.output_file, "r", encoding="utf-8") as f:
|
| 302 |
+
for row in f:
|
| 303 |
+
row = json.loads(row)
|
| 304 |
+
idx = row["idx"]
|
| 305 |
+
total = row["total"]
|
| 306 |
+
total_score = row["total_score"]
|
| 307 |
+
finished_idx_set.add(idx)
|
| 308 |
+
print(f"finished count: {len(finished_idx_set)}")
|
| 309 |
+
|
| 310 |
+
with open(args.eval_data_file, "r", encoding="utf-8") as fin, open(args.output_file, "a+", encoding="utf-8") as fout:
|
| 311 |
+
for row in fin:
|
| 312 |
+
row = json.loads(row)
|
| 313 |
+
idx = row["idx"]
|
| 314 |
+
prompt = row["prompt"]
|
| 315 |
+
response = row["response"]
|
| 316 |
+
prediction = row["prediction"]
|
| 317 |
+
time_cost = row["time_cost"]
|
| 318 |
+
|
| 319 |
+
if idx in finished_idx_set:
|
| 320 |
+
continue
|
| 321 |
+
finished_idx_set.add(idx)
|
| 322 |
+
|
| 323 |
+
conversation = prompt.split("\n\n")[-1].strip()
|
| 324 |
+
|
| 325 |
+
task_prompt_2_ = task_prompt_2.format(conversation=conversation, response=prediction)
|
| 326 |
+
# print(task_prompt_2_)
|
| 327 |
+
|
| 328 |
+
task_prompt_ = task_prompt + "\n\n" + task_prompt_2_
|
| 329 |
+
# print(task_prompt_)
|
| 330 |
+
|
| 331 |
+
try:
|
| 332 |
+
llm_response = client.chat.completions.create(
|
| 333 |
+
model=args.model_name,
|
| 334 |
+
messages=[{"role": "user", "content": task_prompt_}],
|
| 335 |
+
stream=False,
|
| 336 |
+
top_p=0.95,
|
| 337 |
+
temperature=0.6,
|
| 338 |
+
)
|
| 339 |
+
except openai.BadRequestError as e:
|
| 340 |
+
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
|
| 341 |
+
continue
|
| 342 |
+
except openai.InternalServerError as e:
|
| 343 |
+
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
|
| 344 |
+
continue
|
| 345 |
+
|
| 346 |
+
content = llm_response.choices[0].message.content
|
| 347 |
+
try:
|
| 348 |
+
evaluate = json.loads(content)
|
| 349 |
+
except json.decoder.JSONDecodeError as e:
|
| 350 |
+
print(content)
|
| 351 |
+
raise e
|
| 352 |
+
score1 = evaluate["任务相关"]
|
| 353 |
+
score2 = evaluate["回复相关"]
|
| 354 |
+
score3 = evaluate["邀约力度"]
|
| 355 |
+
|
| 356 |
+
score1 = float(score1)
|
| 357 |
+
score2 = float(score2)
|
| 358 |
+
score3 = float(score3)
|
| 359 |
+
|
| 360 |
+
score = (score1 + score2 + score3) / 5
|
| 361 |
+
|
| 362 |
+
total += 1
|
| 363 |
+
total_score += score
|
| 364 |
+
average_score = total_score / total
|
| 365 |
+
print(f"average_score: {average_score}")
|
| 366 |
+
|
| 367 |
+
row_ = {
|
| 368 |
+
"idx": idx,
|
| 369 |
+
"prompt": prompt,
|
| 370 |
+
"response": response,
|
| 371 |
+
"prediction": prediction,
|
| 372 |
+
"time_cost": time_cost,
|
| 373 |
+
"evaluate": evaluate,
|
| 374 |
+
"total": total,
|
| 375 |
+
"score": score,
|
| 376 |
+
"total_score": total_score,
|
| 377 |
+
"average_score": average_score,
|
| 378 |
+
|
| 379 |
+
}
|
| 380 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 381 |
+
fout.write(f"{row_}\n")
|
| 382 |
+
|
| 383 |
+
return
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
if __name__ == "__main__":
|
| 387 |
+
main()
|
examples/test_metrics/lingoace_chat_metric_test.py
ADDED
|
@@ -0,0 +1,482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 11 |
+
|
| 12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 14 |
+
|
| 15 |
+
import openai
|
| 16 |
+
from openai import AzureOpenAI
|
| 17 |
+
|
| 18 |
+
from project_settings import environment, project_path
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_args():
|
| 22 |
+
"""
|
| 23 |
+
python3 azure_openai.py --model_name gpt-4o-mini \
|
| 24 |
+
--eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
|
| 25 |
+
--client "us_west(47.88.76.239)" \
|
| 26 |
+
--create_time_str 20250723_095001 \
|
| 27 |
+
--interval 10
|
| 28 |
+
|
| 29 |
+
python3 azure_openai.py --model_name gpt-4o-mini \
|
| 30 |
+
--eval_dataset_name arc-easy-1000-choice.jsonl \
|
| 31 |
+
--client "us_west(47.88.76.239)" \
|
| 32 |
+
--create_time_str 20250723_111000 \
|
| 33 |
+
--interval 10
|
| 34 |
+
|
| 35 |
+
"""
|
| 36 |
+
parser = argparse.ArgumentParser()
|
| 37 |
+
parser.add_argument(
|
| 38 |
+
"--model_name",
|
| 39 |
+
default="gpt-4o",
|
| 40 |
+
# default="gpt-4o-mini",
|
| 41 |
+
type=str
|
| 42 |
+
)
|
| 43 |
+
parser.add_argument(
|
| 44 |
+
"--service",
|
| 45 |
+
default="west_us_chatgpt_openai_azure_com",
|
| 46 |
+
type=str
|
| 47 |
+
)
|
| 48 |
+
args = parser.parse_args()
|
| 49 |
+
return args
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
prompt1 = """
|
| 53 |
+
<人设>
|
| 54 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
| 55 |
+
|
| 56 |
+
<工作>
|
| 57 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
| 58 |
+
|
| 59 |
+
<工作流程>
|
| 60 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
| 61 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
| 62 |
+
|
| 63 |
+
<常见用户问题的标准回答>
|
| 64 |
+
1.
|
| 65 |
+
Q: 号码是哪里来的
|
| 66 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
| 67 |
+
2.
|
| 68 |
+
Q: 询问价格
|
| 69 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
| 70 |
+
3.
|
| 71 |
+
Q: 公司/上课地点
|
| 72 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
| 73 |
+
4.
|
| 74 |
+
Q: 是否机器人
|
| 75 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
| 76 |
+
5.
|
| 77 |
+
Q: 考虑商量
|
| 78 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 79 |
+
6.
|
| 80 |
+
Q:客户问AI是否听得到
|
| 81 |
+
A:在听的,您接着说;
|
| 82 |
+
7.
|
| 83 |
+
Q:赞美
|
| 84 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
| 85 |
+
8.
|
| 86 |
+
Q:听不清楚AI说话
|
| 87 |
+
A:现在您能听清楚吗?;
|
| 88 |
+
9.
|
| 89 |
+
Q:课程教材
|
| 90 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
| 91 |
+
10.
|
| 92 |
+
Q:怎么称呼
|
| 93 |
+
A:您好,您叫我静静老师就可以。;
|
| 94 |
+
11.
|
| 95 |
+
Q:干什么的
|
| 96 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
| 97 |
+
12.
|
| 98 |
+
Q:你是骗子吗
|
| 99 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
| 100 |
+
13.
|
| 101 |
+
Q:平台名称
|
| 102 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
| 103 |
+
14.
|
| 104 |
+
Q:联系电话
|
| 105 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
| 106 |
+
15.
|
| 107 |
+
Q:询问线下
|
| 108 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么问题可以咨询我们专属顾问哈;
|
| 109 |
+
16.
|
| 110 |
+
Q:调戏
|
| 111 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
| 112 |
+
17.
|
| 113 |
+
Q:下载软件/APP
|
| 114 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
| 115 |
+
18.
|
| 116 |
+
Q:老师资质
|
| 117 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
| 118 |
+
19.
|
| 119 |
+
Q:优惠活动
|
| 120 |
+
A:这次帮您申请的是免费名额,您可以先体验一���,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
| 121 |
+
20.
|
| 122 |
+
Q:课程内容
|
| 123 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
| 124 |
+
21.
|
| 125 |
+
Q:考虑商量
|
| 126 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 127 |
+
22.
|
| 128 |
+
Q:已经报班/已经上课/还有课程包
|
| 129 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
| 130 |
+
23.
|
| 131 |
+
Q:适合多大孩子学习
|
| 132 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
| 133 |
+
24.
|
| 134 |
+
Q:一节课时长
|
| 135 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
| 136 |
+
25.
|
| 137 |
+
Q:不在某地
|
| 138 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
| 139 |
+
26.
|
| 140 |
+
Q:优势及区别
|
| 141 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
| 142 |
+
27.
|
| 143 |
+
Q:有没有其他课程
|
| 144 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
| 145 |
+
28.
|
| 146 |
+
Q:家长旁听
|
| 147 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
| 148 |
+
29.
|
| 149 |
+
Q:正式报名详情
|
| 150 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
| 151 |
+
30.
|
| 152 |
+
Q:试听之后怎么收费
|
| 153 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
| 154 |
+
31.
|
| 155 |
+
Q:判断孩子基础
|
| 156 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
| 157 |
+
32.
|
| 158 |
+
Q:活动时间限制
|
| 159 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
| 160 |
+
33.
|
| 161 |
+
Q:直播还是录播
|
| 162 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
| 163 |
+
34.
|
| 164 |
+
Q:体验过了
|
| 165 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
| 166 |
+
35.
|
| 167 |
+
Q:线上不考虑
|
| 168 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
| 169 |
+
36.
|
| 170 |
+
Q:上课方式
|
| 171 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
| 172 |
+
37.
|
| 173 |
+
Q:固定老师
|
| 174 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
| 175 |
+
38.
|
| 176 |
+
Q:课程是几节课
|
| 177 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
| 178 |
+
39.
|
| 179 |
+
Q:老师是兼职的还是全职的
|
| 180 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
| 181 |
+
40.
|
| 182 |
+
Q:报了线下班
|
| 183 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
| 184 |
+
41.
|
| 185 |
+
Q:孩子不想学
|
| 186 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
| 187 |
+
|
| 188 |
+
案例:
|
| 189 |
+
[用户]: "hi"
|
| 190 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
| 191 |
+
[用户]: "你是机器人吗?"
|
| 192 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
| 193 |
+
|
| 194 |
+
<限制>
|
| 195 |
+
- 保持友好、专业和机智。
|
| 196 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
| 197 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
| 198 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
| 199 |
+
- 不要输出 “好的”,可以使用其他同义词
|
| 200 |
+
- 不要反复说“他/她”,只需要表达一次。
|
| 201 |
+
|
| 202 |
+
<任务>
|
| 203 |
+
以上是对约课老师的工作描述。
|
| 204 |
+
当前我们已经获得了约课老师与家长的对话记录,一个参考的回答方案,以及约课老师实际的回复。
|
| 205 |
+
现在需要你为约课老师的答复进行评分。
|
| 206 |
+
|
| 207 |
+
评分要点:
|
| 208 |
+
(1)任务相关:句子是否与对话任务相关。2分。
|
| 209 |
+
(2)回复相关:句子是否与家长的回复相关。2分。
|
| 210 |
+
(3)邀约力度:与参考句子进行对比,句子是否具有足够的推销,邀约力度。1分。
|
| 211 |
+
|
| 212 |
+
<举例>
|
| 213 |
+
## 当前对话:
|
| 214 |
+
client: 你好
|
| 215 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
| 216 |
+
client:
|
| 217 |
+
Assistant: 我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 218 |
+
client: 不记得。
|
| 219 |
+
Assistant: 没关系的,您可以放心哈,因为您之前留过电话,所以我们才能联系到您。而且这次特地帮您争取了免费名额,您看要参加一下吗?
|
| 220 |
+
client: 您是没听清您是谁。
|
| 221 |
+
client: 什么?我没听清你前面说的。
|
| 222 |
+
Assistant:
|
| 223 |
+
|
| 224 |
+
## 参考句子:
|
| 225 |
+
抱歉让您听不清,我是LingoAce的约课老师,今天联系您是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?
|
| 226 |
+
|
| 227 |
+
## 约课老师实际的句子:
|
| 228 |
+
您好,现在您能听清楚吗?我是LingoAce的约课老师静静。我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 229 |
+
|
| 230 |
+
## 评分:
|
| 231 |
+
{"任务相关": 2, "回复相关": 2, "邀约力度": 1}
|
| 232 |
+
|
| 233 |
+
<请处理下面这个任务>
|
| 234 |
+
|
| 235 |
+
## 当前对话:
|
| 236 |
+
client: 你好
|
| 237 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
| 238 |
+
client:
|
| 239 |
+
client: 啊,我们不需要的,谢谢啊。
|
| 240 |
+
Assistant: 我们这次联合
|
| 241 |
+
client: 关了。
|
| 242 |
+
Assistant:
|
| 243 |
+
|
| 244 |
+
## 参考句子:
|
| 245 |
+
我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 246 |
+
|
| 247 |
+
## 约课老师实际的句子:
|
| 248 |
+
资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?
|
| 249 |
+
|
| 250 |
+
## 评分:
|
| 251 |
+
""".strip()
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
prompt = """
|
| 255 |
+
<人设>
|
| 256 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
| 257 |
+
|
| 258 |
+
<工作>
|
| 259 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
| 260 |
+
|
| 261 |
+
<工作流程>
|
| 262 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
| 263 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
| 264 |
+
|
| 265 |
+
<常见用户问题的标准回答>
|
| 266 |
+
1.
|
| 267 |
+
Q: 号码是哪里来的
|
| 268 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
| 269 |
+
2.
|
| 270 |
+
Q: 询问价格
|
| 271 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
| 272 |
+
3.
|
| 273 |
+
Q: 公司/上课地点
|
| 274 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
| 275 |
+
4.
|
| 276 |
+
Q: 是否机器人
|
| 277 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
| 278 |
+
5.
|
| 279 |
+
Q: 考虑商量
|
| 280 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 281 |
+
6.
|
| 282 |
+
Q:客户问AI是��听得到
|
| 283 |
+
A:在听的,您接着说;
|
| 284 |
+
7.
|
| 285 |
+
Q:赞美
|
| 286 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
| 287 |
+
8.
|
| 288 |
+
Q:听不清楚AI说话
|
| 289 |
+
A:现在您能听清楚吗?;
|
| 290 |
+
9.
|
| 291 |
+
Q:课程教材
|
| 292 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
| 293 |
+
10.
|
| 294 |
+
Q:怎么称呼
|
| 295 |
+
A:您好,您叫我静静老师就可以。;
|
| 296 |
+
11.
|
| 297 |
+
Q:干什么的
|
| 298 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
| 299 |
+
12.
|
| 300 |
+
Q:你是骗子吗
|
| 301 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
| 302 |
+
13.
|
| 303 |
+
Q:平台名称
|
| 304 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
| 305 |
+
14.
|
| 306 |
+
Q:联系电话
|
| 307 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
| 308 |
+
15.
|
| 309 |
+
Q:询问线下
|
| 310 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么问题可以咨询我们专属顾问哈;
|
| 311 |
+
16.
|
| 312 |
+
Q:调戏
|
| 313 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
| 314 |
+
17.
|
| 315 |
+
Q:下载软件/APP
|
| 316 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
| 317 |
+
18.
|
| 318 |
+
Q:老师资质
|
| 319 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
| 320 |
+
19.
|
| 321 |
+
Q:优惠活动
|
| 322 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
| 323 |
+
20.
|
| 324 |
+
Q:课程内容
|
| 325 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
| 326 |
+
21.
|
| 327 |
+
Q:考虑商量
|
| 328 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
| 329 |
+
22.
|
| 330 |
+
Q:已经报班/已经上课/还有课程包
|
| 331 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
| 332 |
+
23.
|
| 333 |
+
Q:适合多大孩子学习
|
| 334 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
| 335 |
+
24.
|
| 336 |
+
Q:一节课时长
|
| 337 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
| 338 |
+
25.
|
| 339 |
+
Q:不在某地
|
| 340 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
| 341 |
+
26.
|
| 342 |
+
Q:优势及区别
|
| 343 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
| 344 |
+
27.
|
| 345 |
+
Q:有没有其他课程
|
| 346 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
| 347 |
+
28.
|
| 348 |
+
Q:家长旁听
|
| 349 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
| 350 |
+
29.
|
| 351 |
+
Q:正式报名详情
|
| 352 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
| 353 |
+
30.
|
| 354 |
+
Q:试听之后怎么收费
|
| 355 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
| 356 |
+
31.
|
| 357 |
+
Q:判断孩子基础
|
| 358 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
| 359 |
+
32.
|
| 360 |
+
Q:活动时间限制
|
| 361 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
| 362 |
+
33.
|
| 363 |
+
Q:直播还是录播
|
| 364 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
| 365 |
+
34.
|
| 366 |
+
Q:体验过了
|
| 367 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
| 368 |
+
35.
|
| 369 |
+
Q:线上不考虑
|
| 370 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
| 371 |
+
36.
|
| 372 |
+
Q:上课方式
|
| 373 |
+
A:我们是在线的一对一授课的形式,真人老师��动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
| 374 |
+
37.
|
| 375 |
+
Q:固定老师
|
| 376 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
| 377 |
+
38.
|
| 378 |
+
Q:课程是几节课
|
| 379 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
| 380 |
+
39.
|
| 381 |
+
Q:老师是兼职的还是全职的
|
| 382 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
| 383 |
+
40.
|
| 384 |
+
Q:报了线下班
|
| 385 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
| 386 |
+
41.
|
| 387 |
+
Q:孩子不想学
|
| 388 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
| 389 |
+
|
| 390 |
+
案例:
|
| 391 |
+
[用户]: "hi"
|
| 392 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
| 393 |
+
[用户]: "你是机器人吗?"
|
| 394 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
| 395 |
+
|
| 396 |
+
<限制>
|
| 397 |
+
- 保持友好、专业和机智。
|
| 398 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
| 399 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
| 400 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
| 401 |
+
- 不要输出 “好的”,可以使用其他同义词
|
| 402 |
+
- 不要反复说“他/她”,只需要表达一次。
|
| 403 |
+
|
| 404 |
+
<任务>
|
| 405 |
+
以上是对约课老师的工作描述。
|
| 406 |
+
当前我们已经获得了约课老师与家长的对话记录,约课老师实际的回复。
|
| 407 |
+
现在需要你为约课老师的答复进行评分。
|
| 408 |
+
|
| 409 |
+
评分要点:
|
| 410 |
+
(1)任务相关:句子是否与对话任务相关。2分。
|
| 411 |
+
(2)回复相关:句子是否与家长的回复相关。2分。
|
| 412 |
+
(3)邀约力度:与参考句子进行对比,句子是否具有足够的推销,邀约力度。1分。
|
| 413 |
+
|
| 414 |
+
<举例>
|
| 415 |
+
## 当前对话:
|
| 416 |
+
client: 你好
|
| 417 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
| 418 |
+
client:
|
| 419 |
+
Assistant: 我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 420 |
+
client: 不记得。
|
| 421 |
+
Assistant: 没关系的,您可以放心哈,因为您之前留过电话,所以我们才能联系到您。而且这次特地帮您争取了免费名额,您看要参加一下吗?
|
| 422 |
+
client: 您是没听清您是谁。
|
| 423 |
+
client: 什么?我没听清你前面说的。
|
| 424 |
+
Assistant:
|
| 425 |
+
|
| 426 |
+
## 约课老师实际的句子:
|
| 427 |
+
您好,现在您能听清楚吗?我是LingoAce的约课老师静静。我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
| 428 |
+
|
| 429 |
+
## 评分:
|
| 430 |
+
{"任务相关": 2, "回复相关": 2, "邀约力度": 1}
|
| 431 |
+
|
| 432 |
+
<请处理下面这个任务>
|
| 433 |
+
|
| 434 |
+
## 当前对话:
|
| 435 |
+
client: 你好
|
| 436 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
| 437 |
+
client:
|
| 438 |
+
client: 啊,我们不需要的,谢谢啊。
|
| 439 |
+
Assistant: 我们这次联合
|
| 440 |
+
client: 关了。
|
| 441 |
+
Assistant:
|
| 442 |
+
|
| 443 |
+
## 约课老师实际的句子:
|
| 444 |
+
资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?
|
| 445 |
+
|
| 446 |
+
## 评分:
|
| 447 |
+
""".strip()
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def main():
|
| 451 |
+
args = get_args()
|
| 452 |
+
|
| 453 |
+
service_params = environment.get(args.service, dtype=json.loads)
|
| 454 |
+
client = AzureOpenAI(
|
| 455 |
+
**service_params,
|
| 456 |
+
)
|
| 457 |
+
|
| 458 |
+
llm_response = client.chat.completions.create(
|
| 459 |
+
model=args.model_name,
|
| 460 |
+
messages=[{"role": "user", "content": prompt}],
|
| 461 |
+
stream=False,
|
| 462 |
+
# max_tokens=1,
|
| 463 |
+
top_p=0.95,
|
| 464 |
+
temperature=0.6,
|
| 465 |
+
# logit_bias={
|
| 466 |
+
# 32: 100,
|
| 467 |
+
# 33: 100,
|
| 468 |
+
# 34: 100,
|
| 469 |
+
# 35: 100,
|
| 470 |
+
# 36: 100,
|
| 471 |
+
# 37: 100,
|
| 472 |
+
# }
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
prediction = llm_response.choices[0].message.content
|
| 476 |
+
|
| 477 |
+
print(prediction)
|
| 478 |
+
return
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
if __name__ == "__main__":
|
| 482 |
+
main()
|
llm_eval_script/aws_claude.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/api-inference-examples-claude-messages-code-examples.html
|
| 5 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html
|
| 6 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
|
| 7 |
+
|
| 8 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-text-completion.html
|
| 9 |
+
|
| 10 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/inference-invoke.html
|
| 11 |
+
|
| 12 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference-examples.html
|
| 13 |
+
|
| 14 |
+
"""
|
| 15 |
+
import argparse
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
import sys
|
| 21 |
+
import time
|
| 22 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 23 |
+
|
| 24 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 25 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 26 |
+
|
| 27 |
+
import boto3
|
| 28 |
+
|
| 29 |
+
from project_settings import environment, project_path
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_args():
|
| 33 |
+
"""
|
| 34 |
+
python3 aws_claude.py --model_name anthropic.claude-instant-v1 \
|
| 35 |
+
--eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
|
| 36 |
+
--client "us_west(47.88.76.239)" \
|
| 37 |
+
--create_time_str 20250723-interval-10 \
|
| 38 |
+
--interval 10
|
| 39 |
+
|
| 40 |
+
python3 aws_claude.py --model_name anthropic.claude-v2 \
|
| 41 |
+
--eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
|
| 42 |
+
--client "us_west(47.88.76.239)" \
|
| 43 |
+
--create_time_str 20250723-interval-10 \
|
| 44 |
+
--interval 10
|
| 45 |
+
|
| 46 |
+
"""
|
| 47 |
+
parser = argparse.ArgumentParser()
|
| 48 |
+
parser.add_argument(
|
| 49 |
+
"--model_name",
|
| 50 |
+
# default="anthropic.claude-v2",
|
| 51 |
+
default="anthropic.claude-instant-v1",
|
| 52 |
+
|
| 53 |
+
# default="anthropic.claude-v2:1",
|
| 54 |
+
# default="anthropic.claude-instant-v1:2",
|
| 55 |
+
# default="anthropic.claude-v2:0",
|
| 56 |
+
type=str
|
| 57 |
+
)
|
| 58 |
+
parser.add_argument(
|
| 59 |
+
"--eval_dataset_name",
|
| 60 |
+
default="agent-lingoace-zh-400-choice.jsonl",
|
| 61 |
+
# default="arc-easy-1000-choice.jsonl",
|
| 62 |
+
type=str
|
| 63 |
+
)
|
| 64 |
+
parser.add_argument(
|
| 65 |
+
"--eval_dataset_dir",
|
| 66 |
+
default=(project_path / "data/dataset").as_posix(),
|
| 67 |
+
type=str
|
| 68 |
+
)
|
| 69 |
+
parser.add_argument(
|
| 70 |
+
"--eval_data_dir",
|
| 71 |
+
default=(project_path / "data/eval_data").as_posix(),
|
| 72 |
+
type=str
|
| 73 |
+
)
|
| 74 |
+
parser.add_argument(
|
| 75 |
+
"--client",
|
| 76 |
+
default="shenzhen_sase",
|
| 77 |
+
type=str
|
| 78 |
+
)
|
| 79 |
+
parser.add_argument(
|
| 80 |
+
"--service",
|
| 81 |
+
default="aws_us_east",
|
| 82 |
+
type=str
|
| 83 |
+
)
|
| 84 |
+
parser.add_argument(
|
| 85 |
+
"--create_time_str",
|
| 86 |
+
default="null",
|
| 87 |
+
type=str
|
| 88 |
+
)
|
| 89 |
+
parser.add_argument(
|
| 90 |
+
"--interval",
|
| 91 |
+
default=10,
|
| 92 |
+
type=int
|
| 93 |
+
)
|
| 94 |
+
args = parser.parse_args()
|
| 95 |
+
return args
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def main():
|
| 99 |
+
args = get_args()
|
| 100 |
+
|
| 101 |
+
service = environment.get(key=args.service, dtype=json.loads)
|
| 102 |
+
aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
|
| 103 |
+
aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
|
| 104 |
+
aws_default_region = service["AWS_DEFAULT_REGION"]
|
| 105 |
+
|
| 106 |
+
os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
|
| 107 |
+
os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
|
| 108 |
+
os.environ["AWS_DEFAULT_REGION"] = aws_default_region
|
| 109 |
+
|
| 110 |
+
client = boto3.client(
|
| 111 |
+
service_name="bedrock-runtime",
|
| 112 |
+
region_name=aws_default_region
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
eval_dataset_dir = Path(args.eval_dataset_dir)
|
| 116 |
+
eval_dataset_dir.mkdir(parents=True, exist_ok=True)
|
| 117 |
+
eval_data_dir = Path(args.eval_data_dir)
|
| 118 |
+
eval_data_dir.mkdir(parents=True, exist_ok=True)
|
| 119 |
+
|
| 120 |
+
if args.create_time_str == "null":
|
| 121 |
+
tz = ZoneInfo("Asia/Shanghai")
|
| 122 |
+
now = datetime.now(tz)
|
| 123 |
+
create_time_str = now.strftime("%Y%m%d_%H%M%S")
|
| 124 |
+
# create_time_str = "20250722_173400"
|
| 125 |
+
else:
|
| 126 |
+
create_time_str = args.create_time_str
|
| 127 |
+
|
| 128 |
+
eval_dataset = eval_dataset_dir / args.eval_dataset_name
|
| 129 |
+
|
| 130 |
+
output_file = eval_data_dir / f"aws_claude/anthropic/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
|
| 131 |
+
output_file.parent.mkdir(parents=True, exist_ok=True)
|
| 132 |
+
|
| 133 |
+
total = 0
|
| 134 |
+
total_correct = 0
|
| 135 |
+
|
| 136 |
+
# finished
|
| 137 |
+
finished_idx_set = set()
|
| 138 |
+
if os.path.exists(output_file.as_posix()):
|
| 139 |
+
with open(output_file.as_posix(), "r", encoding="utf-8") as f:
|
| 140 |
+
for row in f:
|
| 141 |
+
row = json.loads(row)
|
| 142 |
+
idx = row["idx"]
|
| 143 |
+
total = row["total"]
|
| 144 |
+
total_correct = row["total_correct"]
|
| 145 |
+
finished_idx_set.add(idx)
|
| 146 |
+
print(f"finished count: {len(finished_idx_set)}")
|
| 147 |
+
|
| 148 |
+
with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
|
| 149 |
+
for row in fin:
|
| 150 |
+
row = json.loads(row)
|
| 151 |
+
idx = row["idx"]
|
| 152 |
+
prompt = row["prompt"]
|
| 153 |
+
response = row["response"]
|
| 154 |
+
|
| 155 |
+
if idx in finished_idx_set:
|
| 156 |
+
continue
|
| 157 |
+
finished_idx_set.add(idx)
|
| 158 |
+
|
| 159 |
+
body = {
|
| 160 |
+
"anthropic_version": "bedrock-2023-05-31",
|
| 161 |
+
"messages": [
|
| 162 |
+
{
|
| 163 |
+
"role": "user",
|
| 164 |
+
"content": [{"type": "text", "text": prompt}]
|
| 165 |
+
}
|
| 166 |
+
],
|
| 167 |
+
"max_tokens": 1000,
|
| 168 |
+
"temperature": 0.5,
|
| 169 |
+
"top_p": 0.95,
|
| 170 |
+
# "thinking": {
|
| 171 |
+
# "type": "enabled",
|
| 172 |
+
# "budget_tokens": 1
|
| 173 |
+
# },
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
try:
|
| 177 |
+
# client.converse()
|
| 178 |
+
time.sleep(args.interval)
|
| 179 |
+
print(f"sleep: {args.interval}")
|
| 180 |
+
time_begin = time.time()
|
| 181 |
+
llm_response = client.invoke_model(
|
| 182 |
+
modelId=args.model_name,
|
| 183 |
+
body=json.dumps(body),
|
| 184 |
+
contentType="application/json"
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
llm_response = json.loads(llm_response["body"].read())
|
| 188 |
+
# print(result['content'][0]['text'])
|
| 189 |
+
time_cost = time.time() - time_begin
|
| 190 |
+
print(f"time_cost: {time_cost}")
|
| 191 |
+
|
| 192 |
+
except Exception as e:
|
| 193 |
+
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
|
| 194 |
+
continue
|
| 195 |
+
|
| 196 |
+
prediction = llm_response["content"][0]["text"]
|
| 197 |
+
|
| 198 |
+
correct = 1 if prediction == response else 0
|
| 199 |
+
|
| 200 |
+
total += 1
|
| 201 |
+
total_correct += correct
|
| 202 |
+
score = total_correct / total
|
| 203 |
+
|
| 204 |
+
row_ = {
|
| 205 |
+
"idx": idx,
|
| 206 |
+
"prompt": prompt,
|
| 207 |
+
"response": response,
|
| 208 |
+
"prediction": prediction,
|
| 209 |
+
"correct": correct,
|
| 210 |
+
"total": total,
|
| 211 |
+
"total_correct": total_correct,
|
| 212 |
+
"score": score,
|
| 213 |
+
"time_cost": time_cost,
|
| 214 |
+
}
|
| 215 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 216 |
+
fout.write(f"{row_}\n")
|
| 217 |
+
|
| 218 |
+
return
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
if __name__ == "__main__":
|
| 222 |
+
main()
|
llm_eval_script/aws_claude_chat.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/api-inference-examples-claude-messages-code-examples.html
|
| 5 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html
|
| 6 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
|
| 7 |
+
|
| 8 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-text-completion.html
|
| 9 |
+
|
| 10 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/inference-invoke.html
|
| 11 |
+
|
| 12 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference-examples.html
|
| 13 |
+
|
| 14 |
+
"""
|
| 15 |
+
import argparse
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
import sys
|
| 21 |
+
import time
|
| 22 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 23 |
+
|
| 24 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 25 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 26 |
+
|
| 27 |
+
import boto3
|
| 28 |
+
|
| 29 |
+
from project_settings import environment, project_path
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_args():
    """Build and parse the CLI arguments for the AWS Claude chat evaluation run.

    Example invocations:

        python3 aws_claude_chat.py --model_name anthropic.claude-instant-v1 \
            --eval_dataset_name agent-lingoace-zh-80-chat.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250724-interval-1 \
            --interval 1

        python3 aws_claude_chat.py --model_name anthropic.claude-v2 \
            --eval_dataset_name agent-lingoace-zh-80-chat.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250724-interval-1 \
            --interval 1
    """
    parser = argparse.ArgumentParser()
    # (flag, default, type) triples keep the option list compact and uniform.
    option_specs = [
        ("--model_name", "anthropic.claude-instant-v1", str),
        ("--eval_dataset_name", "agent-lingoace-zh-80-chat.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        ("--service", "aws_us_east", str),
        # "null" means: stamp the run with the current time instead.
        ("--create_time_str", "null", str),
        # Seconds to sleep before each request.
        ("--interval", 1, int),
    ]
    for flag, default_value, value_type in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args()
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def main():
    """Run a chat-style evaluation against an Anthropic Claude model on AWS Bedrock.

    Reads a JSONL dataset of {"idx", "prompt", "response"} rows, invokes the
    model once per unfinished row, and appends one JSON line per row to a
    ".raw" output file.  Re-running with the same --create_time_str resumes
    from the rows already recorded (no scoring is done here, unlike the
    choice-style sibling scripts).
    """
    args = get_args()

    # AWS credentials come from the environment entry named by --service.
    service = environment.get(key=args.service, dtype=json.loads)
    aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
    aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
    aws_default_region = service["AWS_DEFAULT_REGION"]

    # boto3 picks the credentials up from the process environment.
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
    os.environ["AWS_DEFAULT_REGION"] = aws_default_region

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=aws_default_region
    )

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: start a fresh run stamped with the current Shanghai time.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250722_173400"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"aws_claude/anthropic/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    total = 0

    # Resume support: collect idx values already written to the output file
    # so a rerun skips them; `total` continues from the last recorded value.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # Bedrock "messages" request body for Anthropic Claude models.
            body = {
                "anthropic_version": "bedrock-2023-05-31",
                "messages": [
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prompt}]
                    }
                ],
                "max_tokens": 1000,
                "temperature": 0.5,
                "top_p": 0.95,
                # NOTE(review): generation stops as soon as the literal text
                # "client" is emitted -- confirm this is the intended stop
                # marker for this chat dataset.
                "stop_sequences": ["client"],
                # "thinking": {
                #     "type": "enabled",
                #     "budget_tokens": 1
                # },
            }

            try:
                # client.converse()
                # Throttle requests to stay under the account's rate limit.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.invoke_model(
                    modelId=args.model_name,
                    body=json.dumps(body),
                    contentType="application/json"
                )

                # The response body is a streaming object; read it fully and
                # decode the JSON payload.
                llm_response = json.loads(llm_response["body"].read())
                # print(result['content'][0]['text'])
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")

            except Exception as e:
                # Best-effort: log the failure and move on; the row stays
                # unfinished and will be retried on the next run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response["content"][0]["text"]

            total += 1

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# Script entry point: run the evaluation when executed directly.
if __name__ == "__main__":
    main()
|
llm_eval_script/azure_openai.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 11 |
+
|
| 12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 14 |
+
|
| 15 |
+
import openai
|
| 16 |
+
from openai import AzureOpenAI
|
| 17 |
+
|
| 18 |
+
from project_settings import environment, project_path
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_args():
    """Build and parse the CLI arguments for the Azure OpenAI choice evaluation.

    Example invocations:

        python3 azure_openai.py --model_name gpt-4o-mini \
            --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250723_095001 \
            --interval 10

        python3 azure_openai.py --model_name gpt-4o-mini \
            --eval_dataset_name arc-easy-1000-choice.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250723_111000 \
            --interval 10
    """
    parser = argparse.ArgumentParser()
    # (flag, default, type) triples keep the option list compact and uniform.
    option_specs = [
        ("--model_name", "gpt-4o", str),
        ("--eval_dataset_name", "agent-lingoace-zh-400-choice.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        ("--service", "west_us_chatgpt_openai_azure_com", str),
        # "null" means: stamp the run with the current time instead.
        ("--create_time_str", "null", str),
        # Seconds to sleep before each request.
        ("--interval", 10, int),
    ]
    for flag, default_value, value_type in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def main():
    """Run a multiple-choice evaluation against an Azure OpenAI deployment.

    Reads a JSONL dataset of {"idx", "prompt", "response"} rows, queries the
    model once per unfinished row, scores exact-match accuracy, and appends
    one JSON line per row to the output file.  Re-running with the same
    --create_time_str resumes from the rows already recorded.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: start a fresh run stamped with the current Shanghai time.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Credentials (api_key / api_version / azure_endpoint) come from the
    # environment entry named by --service; never hard-code them here.
    service_params = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(
        **service_params,
    )

    total = 0
    total_correct = 0

    # Resume support: collect idx values already written to the output file
    # so a rerun skips them; running totals continue from the last row.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Throttle requests to stay under the deployment's rate limit.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    # Force a single-token answer; the logit_bias below boosts
                    # token ids 32-39, presumably the single-character choice
                    # labels -- TODO confirm against the deployed tokenizer.
                    max_tokens=1,
                    top_p=0.95,
                    temperature=0.6,
                    logit_bias={
                        32: 100,
                        33: 100,
                        34: 100,
                        35: 100,
                        36: 100,
                        37: 100,
                        38: 100,
                        39: 100,
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except openai.BadRequestError as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            except openai.InternalServerError as e:
                # Transient server-side failures must not abort the whole
                # resumable run; skip the row and let a rerun retry it
                # (consistent with azure_openai_chat.py).
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response.choices[0].message.content

            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# Script entry point: run the evaluation when executed directly.
if __name__ == "__main__":
    main()
|
llm_eval_script/azure_openai_chat.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 11 |
+
|
| 12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 14 |
+
|
| 15 |
+
import openai
|
| 16 |
+
from openai import AzureOpenAI
|
| 17 |
+
|
| 18 |
+
from project_settings import environment, project_path
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_args():
    """Build and parse the CLI arguments for the Azure OpenAI chat evaluation.

    Example invocation:

        python3 azure_openai_chat.py --model_name gpt-4o-mini \
            --eval_dataset_name agent-lingoace-zh-80-chat.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250723_095001 \
            --interval 10
    """
    parser = argparse.ArgumentParser()
    # (flag, default, type) triples keep the option list compact and uniform.
    option_specs = [
        ("--model_name", "gpt-4o", str),
        ("--eval_dataset_name", "agent-lingoace-zh-80-chat.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        ("--service", "west_us_chatgpt_openai_azure_com", str),
        # "null" means: stamp the run with the current time instead.
        ("--create_time_str", "null", str),
        # Seconds to sleep before each request (0 = no throttling).
        ("--interval", 0, int),
    ]
    for flag, default_value, value_type in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def main():
    """Run a free-form chat evaluation against an Azure OpenAI deployment.

    Reads a JSONL dataset of {"idx", "prompt", "response"} rows, queries the
    model once per unfinished row, and appends one JSON line per row to a
    ".raw" output file.  Re-running with the same --create_time_str resumes
    from the rows already recorded (no scoring is done here; the raw
    predictions are scored by a separate metric script).
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: start a fresh run stamped with the current Shanghai time.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Credentials (api_key / api_version / azure_endpoint) come from the
    # environment entry named by --service; never hard-code them here.
    service_params = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(
        **service_params,
    )

    total = 0

    # Resume support: collect idx values already written to the output file
    # so a rerun skips them; `total` continues from the last recorded value.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Throttle requests to stay under the deployment's rate limit.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    top_p=0.95,
                    temperature=0.6,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except openai.BadRequestError as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            except openai.InternalServerError as e:
                # Transient server-side failures must not abort the whole
                # resumable run; skip the row and let a rerun retry it.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response.choices[0].message.content

            total += 1

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# Script entry point: run the evaluation when executed directly.
if __name__ == "__main__":
    main()
|
llm_eval_script/byteplus.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://docs.byteplus.com/en/docs/ModelArk/1099455
|
| 5 |
+
|
| 6 |
+
model list
|
| 7 |
+
https://docs.byteplus.com/en/docs/ModelArk/1330310
|
| 8 |
+
"""
|
| 9 |
+
import argparse
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import sys
|
| 15 |
+
import time
|
| 16 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 17 |
+
|
| 18 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 19 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 20 |
+
|
| 21 |
+
from openai import OpenAI
|
| 22 |
+
|
| 23 |
+
from project_settings import environment, project_path
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def get_args():
    """Build and parse the CLI arguments for the BytePlus ModelArk evaluation."""
    parser = argparse.ArgumentParser()
    # (flag, default, type) triples keep the option list compact and uniform.
    option_specs = [
        ("--model_name", "seedance-1-0-lite-t2v-250428", str),
        ("--eval_dataset_name", "arc-easy-1000-choice.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        # Name of the environment entry holding the Ark API key.
        ("--service", "byteplus_api_key", str),
    ]
    for flag, default_value, value_type in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args()
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def main():
    """Evaluate a BytePlus (ModelArk) chat model on a multiple-choice JSONL dataset.

    Each dataset row is {"idx", "prompt", "response"}.  For every unfinished
    row the model is queried once, the reply is compared to the reference by
    exact string match, and a scored JSON line is appended to the output file.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the timestamp is always regenerated (there is no
    # --create_time_str override like the sibling scripts), so each
    # invocation writes to a new directory and the resume logic below can
    # never find earlier rows -- confirm this is intended.
    tz = ZoneInfo("Asia/Shanghai")
    now = datetime.now(tz)
    create_time_str = now.strftime("%Y%m%d_%H%M%S")

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"byteplus/byteplus/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # The Ark API key is looked up from the environment entry named by
    # --service; the OpenAI-compatible client targets the ModelArk endpoint.
    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://ark.ap-southeast.bytepluses.com/api/v3",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_correct = 0

    # Resume support: collect idx values already written to the output file
    # so a rerun skips them; running totals continue from the last row.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                time_begin = time.time()
                completion = client.chat.completions.create(
                    # Replace with your Inference Endpoint.
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: log the failure and move on to the next row.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = completion.choices[0].message.content

            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Script entry point: run the evaluation when executed directly.
if __name__ == "__main__":
    main()
|
llm_eval_script/gemini_google.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
import tempfile
|
| 11 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 12 |
+
|
| 13 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 14 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 15 |
+
|
| 16 |
+
from google import genai
|
| 17 |
+
from google.genai import types
|
| 18 |
+
|
| 19 |
+
from project_settings import environment, project_path
|
| 20 |
+
|
| 21 |
+
|
def get_args():
    """Parse command-line options for the Gemini (Vertex AI) choice evaluation."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_name",
        type=str,
        # The model does not support setting thinking_budget to 0.
        default="gemini-2.5-pro",
        # default="gemini-2.5-flash",
        # default="gemini-2.5-flash-lite-preview-06-17",
    )
    arg_parser.add_argument(
        "--eval_dataset_name",
        type=str,
        default="agent-lingoace-zh-400-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
    )
    arg_parser.add_argument(
        "--eval_dataset_dir",
        type=str,
        default=(project_path / "data/dataset").as_posix(),
    )
    arg_parser.add_argument(
        "--eval_data_dir",
        type=str,
        default=(project_path / "data/eval_data").as_posix(),
    )
    arg_parser.add_argument("--client", type=str, default="shenzhen_sase")
    arg_parser.add_argument("--service", type=str, default="google_potent_veld_462405_t3")
    return arg_parser.parse_args()
| 59 |
+
|
| 60 |
+
|
def main():
    """Run the choice-style evaluation against a Gemini model on Vertex AI.

    Reads prompts from a JSONL eval dataset, asks the model for a single
    answer token, compares it to the reference response, and appends one
    JSON line per sample (carrying running accuracy counters) to the output
    file so an interrupted run can be resumed.
    """
    args = get_args()

    # Service credentials are stored in the environment as a JSON blob.
    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # Vertex AI auth expects a credentials file on disk; materialize the
    # service JSON to a temp path and point GOOGLE_APPLICATION_CREDENTIALS at it.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp the run (Shanghai time) so each invocation gets its own folder.
    tz = ZoneInfo("Asia/Shanghai")
    now = datetime.now(tz)
    create_time_str = now.strftime("%Y%m%d_%H%M%S")

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
    )
    # max_output_tokens=1 forces a single choice letter; thinking is disabled
    # so that first token IS the answer.
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        max_output_tokens=1,
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    total = 0
    total_correct = 0

    # Resume support: collect idx of every sample already in the output file
    # and restore the running counters from the last line seen.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=prompt)
                    ]
                )
            ]
            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")
            try:
                prediction = llm_response.candidates[0].content.parts[0].text
            except (TypeError, IndexError, AttributeError) as e:
                # Blocked or empty responses can surface as None candidates,
                # empty part lists, or missing attributes — skip them all.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per sample so a crash loses at most one result
            # (matches the resume logic above and the siliconflow script).
            fout.flush()

    return
| 173 |
+
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
|
| 176 |
+
main()
|
llm_eval_script/gemini_google_chat.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
import tempfile
|
| 11 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 12 |
+
|
| 13 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 14 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 15 |
+
|
| 16 |
+
from google import genai
|
| 17 |
+
from google.genai import types
|
| 18 |
+
|
| 19 |
+
from project_settings import environment, project_path
|
| 20 |
+
|
| 21 |
+
|
def get_args():
    """Parse command-line options for the Gemini (Vertex AI) chat evaluation."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_name",
        type=str,
        # default="gemini-2.5-pro",  # The model does not support setting thinking_budget to 0.
        default="gemini-2.5-flash",
        # default="gemini-2.5-flash-lite-preview-06-17",
    )
    arg_parser.add_argument(
        "--eval_dataset_name",
        type=str,
        default="agent-lingoace-zh-80-chat.jsonl",
    )
    arg_parser.add_argument(
        "--eval_dataset_dir",
        type=str,
        default=(project_path / "data/dataset").as_posix(),
    )
    arg_parser.add_argument(
        "--eval_data_dir",
        type=str,
        default=(project_path / "data/eval_data").as_posix(),
    )
    arg_parser.add_argument("--client", type=str, default="shenzhen_sase")
    arg_parser.add_argument("--service", type=str, default="google_potent_veld_462405_t3")
    return arg_parser.parse_args()
| 58 |
+
|
| 59 |
+
|
def main():
    """Run the chat-style evaluation against a Gemini model on Vertex AI.

    Unlike the choice script, the model answers freely (no max_output_tokens
    cap) and no correctness is scored here: raw predictions are appended to a
    `.raw` JSONL file for later metric computation. The file doubles as a
    resume checkpoint.
    """
    args = get_args()

    # Service credentials are stored in the environment as a JSON blob.
    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # Vertex AI auth expects a credentials file on disk; materialize the
    # service JSON to a temp path and point GOOGLE_APPLICATION_CREDENTIALS at it.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp the run (Shanghai time) so each invocation gets its own folder.
    tz = ZoneInfo("Asia/Shanghai")
    now = datetime.now(tz)
    create_time_str = now.strftime("%Y%m%d_%H%M%S")

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
    )
    # Free-form chat output: no token cap; thinking disabled.
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        # max_output_tokens=1,
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    total = 0

    # Resume support: collect idx of every sample already in the output file
    # and restore the running counter from the last line seen.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=prompt)
                    ]
                )
            ]
            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")
            try:
                prediction = llm_response.candidates[0].content.parts[0].text
            except (TypeError, IndexError, AttributeError) as e:
                # Blocked or empty responses can surface as None candidates,
                # empty part lists, or missing attributes — skip them all.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            total += 1

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per sample so a crash loses at most one result
            # (matches the resume logic above and the siliconflow script).
            fout.flush()

    return
| 164 |
+
|
| 165 |
+
|
| 166 |
+
if __name__ == "__main__":
|
| 167 |
+
main()
|
llm_eval_script/ollama.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 10 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 11 |
+
|
| 12 |
+
import openai
|
| 13 |
+
from openai import OpenAI
|
| 14 |
+
import pandas as pd
|
| 15 |
+
|
| 16 |
+
from project_settings import environment, project_path
|
| 17 |
+
|
| 18 |
+
|
def get_args():
    """Parse CLI options for the ollama-based choice evaluation."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--dataset_dir",
        type=str,
        default=(project_path / "data/llm-log/eval-llm-dataset/choice").as_posix(),
    )
    arg_parser.add_argument("--ollama_host", type=str, default="10.75.27.247")
    arg_parser.add_argument("--ollama_port", type=int, default=11434)
    arg_parser.add_argument(
        "--model_id",
        type=str,
        # default="qwen2:7b-instruct-fp16",
        # default="llama3:8b-instruct-fp16",
        # default="gemma3:4b",
        default="gemma3:1b",
    )
    arg_parser.add_argument("--api_key", type=str, default="ollama")
    return arg_parser.parse_args()
| 41 |
+
|
| 42 |
+
|
def main():
    """Run the choice evaluation against a local ollama OpenAI-compatible endpoint.

    Each sample directory holds system_prompt.txt / user_prompt.txt /
    response.txt. Up to 1000 samples are sent to the model; predictions are
    collected and written to result.xlsx.
    """
    args = get_args()

    dataset_dir = Path(args.dataset_dir)

    base_url = f"http://{args.ollama_host}:{args.ollama_port}/v1"

    client = OpenAI(
        api_key=args.api_key,
        base_url=base_url,
    )

    result = list()
    count = 0
    for sample_dir in dataset_dir.glob("*"):
        if count >= 1000:  # hard cap on evaluated samples
            break
        print(f"count: {count}, process: {sample_dir.as_posix()}")
        count += 1
        system_prompt_file = sample_dir / "system_prompt.txt"
        user_prompt_file = sample_dir / "user_prompt.txt"
        response_file = sample_dir / "response.txt"

        with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
            system_prompt = f.read()
        with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
            user_prompt = f.read()
        with open(response_file.as_posix(), "r", encoding="utf-8") as f:
            response = f.read()

        try:
            completion = client.chat.completions.create(
                model=args.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.8,
                top_p=0.8
            )
        except Exception as e:
            # One failed request must not abort the run and discard every
            # result collected so far — log and move on.
            print(f"request failed, error type: {type(e)}, error text: {str(e)}")
            continue
        # Read the prediction directly instead of round-tripping the whole
        # completion through model_dump_json() / json.loads().
        predict = completion.choices[0].message.content

        row = {
            "system_prompt": system_prompt,
            "user_prompt": user_prompt,
            "response": response,
            "predict": predict.strip(),
        }
        result.append(row)

    df = pd.DataFrame(result)
    df.to_excel("result.xlsx", index=False)
    return
| 98 |
+
|
| 99 |
+
|
| 100 |
+
if __name__ == "__main__":
|
| 101 |
+
main()
|
llm_eval_script/siliconflow.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
|
| 5 |
+
|
| 6 |
+
https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning
|
| 7 |
+
|
| 8 |
+
Model Name:
|
| 9 |
+
Pro/deepseek-ai/DeepSeek-R1
|
| 10 |
+
Tips:
|
| 11 |
+
(1)thinking_budget: Must be greater than or equal to 1
|
| 12 |
+
(2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.
|
| 13 |
+
|
| 14 |
+
Model Name:
|
| 15 |
+
tencent/Hunyuan-A13B-Instruct
|
| 16 |
+
Tips:
|
| 17 |
+
(1)它在回答时总是会先思考,最后给出答案.这适合知识问答,但不符合我们Agent的需求. 后来强制其只能输出 A-E 中的一个字符(max_tokens=4),以完成评估.
|
| 18 |
+
max_tokens=4,
|
| 19 |
+
logit_bias={
|
| 20 |
+
32: 100,
|
| 21 |
+
33: 100,
|
| 22 |
+
34: 100,
|
| 23 |
+
35: 100,
|
| 24 |
+
36: 100,
|
| 25 |
+
37: 100,
|
| 26 |
+
},
|
| 27 |
+
|
| 28 |
+
Model Name:
|
| 29 |
+
deepseek-ai/DeepSeek-R1
|
| 30 |
+
Tips:
|
| 31 |
+
(1)为了让它只输出一个字符,设置 max_tokens=3
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
Model Name:
|
| 35 |
+
Qwen/Qwen3-8B
|
| 36 |
+
deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
|
| 37 |
+
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 38 |
+
Tips:
|
| 39 |
+
(1)为了让它只输出一个字符,设置 max_tokens=1
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
Model Name:
|
| 44 |
+
baidu/ERNIE-4.5-300B-A47B
|
| 45 |
+
Tips:
|
| 46 |
+
(1)它可能使用的是bpe 分词, logit_bias 注释掉。
|
| 47 |
+
|
| 48 |
+
"""
|
| 49 |
+
import argparse
|
| 50 |
+
from datetime import datetime
|
| 51 |
+
import json
|
| 52 |
+
import os
|
| 53 |
+
from pathlib import Path
|
| 54 |
+
import sys
|
| 55 |
+
import time
|
| 56 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 57 |
+
|
| 58 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 59 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 60 |
+
|
| 61 |
+
from openai import OpenAI
|
| 62 |
+
|
| 63 |
+
from project_settings import environment, project_path
|
| 64 |
+
|
| 65 |
+
|
def get_args():
    """Build and parse CLI arguments for the SiliconFlow choice evaluation."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_name",
        type=str,
        # default="Pro/deepseek-ai/DeepSeek-R1",
        # default="tencent/Hunyuan-A13B-Instruct",
        default="Qwen/Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1",
        # default="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        # default="baidu/ERNIE-4.5-300B-A47B",
    )
    arg_parser.add_argument(
        "--eval_dataset_name",
        type=str,
        default="agent-lingoace-zh-400-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
    )
    arg_parser.add_argument(
        "--eval_dataset_dir",
        type=str,
        default=(project_path / "data/dataset").as_posix(),
    )
    arg_parser.add_argument(
        "--eval_data_dir",
        type=str,
        default=(project_path / "data/eval_data").as_posix(),
    )
    arg_parser.add_argument("--client", type=str, default="shenzhen_sase")
    arg_parser.add_argument("--service", type=str, default="siliconflow_api_key")
    # "null" means: derive a fresh timestamp for this run.
    arg_parser.add_argument("--create_time_str", type=str, default="null")
    # Seconds to sleep before each request (simple rate limiting).
    arg_parser.add_argument("--interval", type=int, default=10)
    return arg_parser.parse_args()
| 117 |
+
|
| 118 |
+
|
def main():
    """Run the choice-style evaluation against a SiliconFlow-hosted model.

    Reads prompts from a JSONL eval dataset, requests a single answer token
    per prompt (max_tokens=1 plus logit_bias restricting output to the
    A-H token ids — assumes a tokenizer where ids 32-39 map to those
    letters; TODO confirm per model), compares it with the reference
    response, and appends one JSON line per sample (with running accuracy)
    to the output file so an interrupted run can be resumed.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" sentinel: generate a fresh Shanghai-time timestamp; otherwise
    # reuse the caller-supplied one (allows resuming a previous run folder).
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" in model names (e.g. "Qwen/Qwen3-8B") would split the output path.
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_correct = 0

    # Resume support: collect idx of every sample already in the output file
    # and restore running counters from the last line seen.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Rate-limit before every request.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    stream=False,
                    # max_tokens=4096,
                    max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    # Strongly bias the single output token toward the ids
                    # believed to encode the choice letters (see module docstring).
                    logit_bias={
                        32: 100,
                        33: 100,
                        34: 100,
                        35: 100,
                        36: 100,
                        37: 100,
                        38: 100,
                        39: 100,
                    },
                    # SiliconFlow extension: minimal thinking budget
                    # (must be >= 1 for reasoning-capable models).
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: log the failure and move to the next sample.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # print(f"completion: {completion}")
            prediction = completion.choices[0].message.content

            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per sample so a crash loses at most one result.
            fout.flush()

    return
| 233 |
+
|
| 234 |
+
|
| 235 |
+
if __name__ == "__main__":
|
| 236 |
+
main()
|
llm_eval_script/siliconflow_chat.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
|
| 5 |
+
|
| 6 |
+
https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning
|
| 7 |
+
|
| 8 |
+
Model Name:
|
| 9 |
+
Pro/deepseek-ai/DeepSeek-R1
|
| 10 |
+
Tips:
|
| 11 |
+
(1)thinking_budget: Must be greater than or equal to 1
|
| 12 |
+
(2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.
|
| 13 |
+
|
| 14 |
+
Model Name:
|
| 15 |
+
tencent/Hunyuan-A13B-Instruct
|
| 16 |
+
Tips:
|
| 17 |
+
(1)它在回答时总是会先思考,最后给出答案.这适合知识问答,但不符合我们Agent的需求. 后来强制其只能输出 A-E 中的一个字符(max_tokens=4),以完成评估.
|
| 18 |
+
max_tokens=4,
|
| 19 |
+
logit_bias={
|
| 20 |
+
32: 100,
|
| 21 |
+
33: 100,
|
| 22 |
+
34: 100,
|
| 23 |
+
35: 100,
|
| 24 |
+
36: 100,
|
| 25 |
+
37: 100,
|
| 26 |
+
},
|
| 27 |
+
|
| 28 |
+
Model Name:
|
| 29 |
+
deepseek-ai/DeepSeek-R1
|
| 30 |
+
Tips:
|
| 31 |
+
(1)为了让它只输出一个字符,设置 max_tokens=3
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
Model Name:
|
| 35 |
+
Qwen/Qwen3-8B
|
| 36 |
+
deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
|
| 37 |
+
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 38 |
+
Tips:
|
| 39 |
+
(1)为了让它只输出一个字符,设置 max_tokens=1
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
Model Name:
|
| 44 |
+
baidu/ERNIE-4.5-300B-A47B
|
| 45 |
+
Tips:
|
| 46 |
+
(1)它可能使用的是bpe 分词, logit_bias 注释掉。
|
| 47 |
+
|
| 48 |
+
"""
|
| 49 |
+
import argparse
|
| 50 |
+
from datetime import datetime
|
| 51 |
+
import json
|
| 52 |
+
import os
|
| 53 |
+
from pathlib import Path
|
| 54 |
+
import sys
|
| 55 |
+
import time
|
| 56 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 57 |
+
|
| 58 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 59 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 60 |
+
|
| 61 |
+
from openai import OpenAI
|
| 62 |
+
|
| 63 |
+
from project_settings import environment, project_path
|
| 64 |
+
|
| 65 |
+
|
def get_args():
    """Build and parse CLI arguments for the SiliconFlow chat evaluation."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model_name",
        type=str,
        # default="Pro/deepseek-ai/DeepSeek-R1",
        # default="tencent/Hunyuan-A13B-Instruct",
        # default="Qwen/Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1",
        # default="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        # default="baidu/ERNIE-4.5-300B-A47B",
    )
    arg_parser.add_argument(
        "--eval_dataset_name",
        type=str,
        default="agent-lingoace-zh-80-chat.jsonl",
    )
    arg_parser.add_argument(
        "--eval_dataset_dir",
        type=str,
        default=(project_path / "data/dataset").as_posix(),
    )
    arg_parser.add_argument(
        "--eval_data_dir",
        type=str,
        default=(project_path / "data/eval_data").as_posix(),
    )
    arg_parser.add_argument("--client", type=str, default="shenzhen_sase")
    arg_parser.add_argument("--service", type=str, default="siliconflow_api_key")
    # "null" means: derive a fresh timestamp for this run.
    arg_parser.add_argument("--create_time_str", type=str, default="null")
    # Seconds to sleep before each request (simple rate limiting).
    arg_parser.add_argument("--interval", type=int, default=1)
    return arg_parser.parse_args()
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def main():
    """Run the chat-evaluation loop against the SiliconFlow OpenAI-compatible API.

    Reads prompts from a JSONL dataset, sends each to the configured model,
    and appends one JSON line per answered prompt to a ``.raw`` output file.
    The output file doubles as a checkpoint: already-answered ``idx`` values
    are skipped on restart, so the run is resumable.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" sentinel: stamp the run with the current Asia/Shanghai time;
    # passing an explicit timestamp lets a previous run be resumed.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" is not valid in a path component, so flatten the model id.
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0

    # Recover progress from a previous run: collect finished idx values and
    # the last written running total from the existing output file.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    # Append mode so resumed runs extend, never overwrite, the output.
    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Throttle requests to stay under the provider's rate limit.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    stream=False,
                    max_tokens=4096,
                    # max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    # logit_bias={
                    #     32: 100,
                    #     33: 100,
                    #     34: 100,
                    #     35: 100,
                    #     36: 100,
                    #     37: 100,
                    #     38: 100,
                    #     39: 100,
                    # },
                    # thinking_budget=1 effectively disables long reasoning
                    # output for models that support it (SiliconFlow extension).
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: log and skip this sample; the idx is NOT written
                # to the file, so a later resumed run will retry it.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = completion.choices[0].message.content

            total += 1

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per line so progress survives an abrupt kill.
            fout.flush()

    return
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
if __name__ == "__main__":
|
| 225 |
+
main()
|
log.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
import logging
|
| 5 |
+
from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
|
| 6 |
+
import os
|
| 7 |
+
from zoneinfo import ZoneInfo  # bundled with Python 3.9+; no extra install needed
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def get_converter(tz_info: str = "Asia/Shanghai"):
    """Return a ``logging``-compatible time converter for the given IANA zone.

    The returned callable maps a POSIX timestamp to a ``time.struct_time``
    localized to *tz_info*, suitable for ``logging.Formatter.converter``.
    """
    def _convert(timestamp):
        # Formatter.converter must yield a struct_time; go through an
        # aware datetime to apply the timezone.
        return datetime.fromtimestamp(timestamp, ZoneInfo(tz_info)).timetuple()

    return _convert
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def setup_size_rotating(log_directory: str, tz_info: str = "Asia/Shanghai"):
    """Configure size-based rotating log files under *log_directory*.

    Sets up named loggers ("main", "http", "api", "alarm") with their own
    rotating files, plus root-level debug/info/error files via
    ``logging.basicConfig``. "main", "http" and "api" also echo to the
    console at INFO level; "alarm" is file-only.

    Bug fix: the alarm/debug/info/error handlers previously used a bare
    ``logging.Formatter(fmt)`` without the timezone converter and datefmt,
    so their timestamps disagreed with the other log files. All handlers
    now share the same timezone-aware formatter.
    """
    fmt = "%(asctime)s|%(name)s|%(levelname)s|%(filename)s|%(lineno)d|%(message)s"

    formatter = logging.Formatter(
        fmt=fmt,
        datefmt="%Y-%m-%d %H:%M:%S %z"
    )
    # Render timestamps in the requested zone instead of local time.
    formatter.converter = get_converter(tz_info)

    def _file_handler(basename: str, max_mb: int, level: int) -> RotatingFileHandler:
        # Build one size-rotating file handler with the shared formatter.
        handler = RotatingFileHandler(
            filename=os.path.join(log_directory, basename),
            maxBytes=max_mb * 1024 * 1024,
            encoding="utf-8",
            backupCount=2,
        )
        handler.setLevel(level)
        handler.setFormatter(formatter)
        return handler

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(formatter)

    # main: console + 100MB rotating file
    main_logger = logging.getLogger("main")
    main_logger.addHandler(stream_handler)
    main_logger.addHandler(_file_handler("main.log", 100, logging.INFO))

    # http: console + 100MB rotating file
    http_logger = logging.getLogger("http")
    http_logger.addHandler(stream_handler)
    http_logger.addHandler(_file_handler("http.log", 100, logging.DEBUG))

    # api: console + 10MB rotating file
    api_logger = logging.getLogger("api")
    api_logger.addHandler(stream_handler)
    api_logger.addHandler(_file_handler("api.log", 10, logging.DEBUG))

    # alarm: file only — no console echo, matching the original design.
    alarm_logger = logging.getLogger("alarm")
    alarm_logger.addHandler(_file_handler("alarm.log", 1, logging.DEBUG))

    # Root logger: split debug/info/error into separate 1MB files.
    logging.basicConfig(
        level=logging.DEBUG,
        datefmt="%a, %d %b %Y %H:%M:%S",
        handlers=[
            _file_handler("debug.log", 1, logging.DEBUG),
            _file_handler("info.log", 1, logging.INFO),
            _file_handler("error.log", 1, logging.ERROR),
        ]
    )
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def setup_time_rotating(log_directory: str):
    """Configure time-based (daily) rotating log files under *log_directory*.

    Mirrors ``setup_size_rotating`` but rotates at midnight / per day and
    keeps 7 backups. NOTE(review): unlike the size-based variant, only the
    "main" logger echoes to the console here, and no timezone converter is
    applied to the formatter (timestamps use local time) — confirm both are
    intentional.
    """
    fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter(fmt))

    # main: console + daily-rotating file
    main_logger = logging.getLogger("main")
    main_logger.addHandler(stream_handler)
    main_info_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "main.log"),
        encoding="utf-8",
        when="midnight",
        interval=1,
        backupCount=7
    )
    main_info_file_handler.setLevel(logging.INFO)
    main_info_file_handler.setFormatter(logging.Formatter(fmt))
    main_logger.addHandler(main_info_file_handler)

    # http: file only, rotated at midnight
    http_logger = logging.getLogger("http")
    http_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "http.log"),
        encoding='utf-8',
        when="midnight",
        interval=1,
        backupCount=7
    )
    http_file_handler.setLevel(logging.DEBUG)
    http_file_handler.setFormatter(logging.Formatter(fmt))
    http_logger.addHandler(http_file_handler)

    # api: file only, rotated at midnight
    api_logger = logging.getLogger("api")
    api_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "api.log"),
        encoding='utf-8',
        when="midnight",
        interval=1,
        backupCount=7
    )
    api_file_handler.setLevel(logging.DEBUG)
    api_file_handler.setFormatter(logging.Formatter(fmt))
    api_logger.addHandler(api_file_handler)

    # alarm: file only, rotated at midnight
    alarm_logger = logging.getLogger("alarm")
    alarm_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "alarm.log"),
        encoding="utf-8",
        when="midnight",
        interval=1,
        backupCount=7
    )
    alarm_file_handler.setLevel(logging.DEBUG)
    alarm_file_handler.setFormatter(logging.Formatter(fmt))
    alarm_logger.addHandler(alarm_file_handler)

    # Root-level files below use when="D" (every 24h from process start)
    # rather than "midnight" — NOTE(review): confirm the difference is wanted.
    debug_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "debug.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    debug_file_handler.setLevel(logging.DEBUG)
    debug_file_handler.setFormatter(logging.Formatter(fmt))

    info_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "info.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    info_file_handler.setLevel(logging.INFO)
    info_file_handler.setFormatter(logging.Formatter(fmt))

    error_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "error.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    error_file_handler.setLevel(logging.ERROR)
    error_file_handler.setFormatter(logging.Formatter(fmt))

    # Root logger: split debug/info/error into separate daily files.
    logging.basicConfig(
        level=logging.DEBUG,
        datefmt="%a, %d %b %Y %H:%M:%S",
        handlers=[
            debug_file_handler,
            info_file_handler,
            error_file_handler,
        ]
    )
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
if __name__ == "__main__":
|
| 222 |
+
pass
|
main.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
docker build -t llm_eval_system:v20250724_1442 .
|
| 5 |
+
|
| 6 |
+
docker stop llm_eval_system_7862 && docker rm llm_eval_system_7862
|
| 7 |
+
|
| 8 |
+
docker run -itd \
|
| 9 |
+
--name llm_eval_system_7862 \
|
| 10 |
+
--restart=always \
|
| 11 |
+
--network host \
|
| 12 |
+
-e port=7862 \
|
| 13 |
+
llm_eval_system:v20250724_1442 \
|
| 14 |
+
/bin/bash
|
| 15 |
+
"""
|
| 16 |
+
import argparse
|
| 17 |
+
import logging
|
| 18 |
+
import platform
|
| 19 |
+
|
| 20 |
+
import gradio as gr
|
| 21 |
+
|
| 22 |
+
from project_settings import environment, project_path, log_directory
|
| 23 |
+
from toolbox.os.command import Command
|
| 24 |
+
import log
|
| 25 |
+
|
| 26 |
+
log.setup_size_rotating(log_directory=log_directory)
|
| 27 |
+
|
| 28 |
+
logger = logging.getLogger("main")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_args():
    """Parse the web app's command-line options.

    Currently only ``--eval_data_dir``, defaulting to ``data/eval_data``
    inside the project tree.
    """
    default_eval_data_dir = (project_path / "data/eval_data").as_posix()

    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_data_dir", default=default_eval_data_dir, type=str)
    return parser.parse_args()
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def shell(cmd: str):
    """Run *cmd* through the project Command helper and return its output."""
    output = Command.popen(cmd)
    return output
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def main():
    """Build and launch the Gradio UI: a single "shell" tab that runs a
    command via :func:`shell` and shows its output.

    Binds to 127.0.0.1 on Windows (development) and 0.0.0.0 elsewhere
    (container deployment); the port comes from the ``port`` environment
    variable, defaulting to 7860.
    """
    args = get_args()  # parsed for its side effects / future use

    # ui
    with gr.Blocks() as blocks:
        with gr.Tabs():
            with gr.TabItem("shell"):
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
                shell_output = gr.Textbox(label="output", max_lines=100)

                shell_button.click(
                    shell,
                    inputs=[shell_text, ],
                    outputs=[shell_output],
                )

    # http://127.0.0.1:7861/
    # http://10.75.27.247:7861/
    blocks.queue().launch(
        # Fix: the original `False if platform.system() == "Windows" else False`
        # evaluated to False on every platform; state that directly.
        share=False,
        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        # server_name="0.0.0.0",
        server_port=environment.get("port", 7860, dtype=int),
    )
    return
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
if __name__ == "__main__":
|
| 75 |
+
main()
|
project_settings.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from toolbox.os.environment import EnvironmentManager
|
| 7 |
+
|
| 8 |
+
# Absolute path of the project root (the directory containing this file).
project_path = os.path.abspath(os.path.dirname(__file__))
project_path = Path(project_path)

# All log files live under <project>/logs; created eagerly at import time.
log_directory = project_path / "logs"
log_directory.mkdir(parents=True, exist_ok=True)

# Shared environment accessor; loads dotenv/dev.env into os.environ on import.
environment = EnvironmentManager(
    filename=os.path.join(project_path, "dotenv/dev.env"),
)

# Canonical timezone used for timestamps and log formatting.
time_zone_info = "Asia/Shanghai"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == '__main__':
|
| 22 |
+
pass
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
python-dotenv
|
| 3 |
+
pandas
|
| 4 |
+
openpyxl
|
tabs/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
pass
|
tabs/fs_tab.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from project_settings import project_path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_fs_tab():
    """Build the "fs" Gradio tab: browse and download files under the
    project's data directories.

    Returns ``locals()`` so the caller can access the created components.
    """
    with gr.TabItem("fs"):
        with gr.Row():
            with gr.Column(scale=3):
                fs_filename = gr.Textbox(label="filename", max_lines=10)
                fs_file = gr.File(label="file")
                # fs_file_dir = gr.Textbox(value="data", label="file_dir")
                fs_file_dir = gr.Dropdown(choices=["data/dataset", "data/eval_data"],
                                          value="data/dataset",
                                          label="file_dir")
                fs_query = gr.Button("query", variant="primary")
            with gr.Column(scale=7):
                fs_filelist_dataset_state = gr.State(value=[])
                fs_filelist_dataset = gr.Dataset(
                    components=[fs_filename, fs_file],
                    samples=fs_filelist_dataset_state.value,
                )

        def when_click_query_files(file_dir: str = "data"):
            # Recursively list files under <project>/<file_dir>, skipping
            # directories, hidden files, and .py/.raw files.
            file_dir = project_path / file_dir
            dataset_state = list()
            for filename in file_dir.glob("**/*.*"):
                if filename.is_dir():
                    continue
                if filename.stem.startswith("."):
                    continue
                if filename.name.endswith(".py"):
                    continue
                if filename.name.endswith(".raw"):
                    continue
                # Each sample: (path relative to file_dir, absolute path).
                dataset_state.append((
                    filename.relative_to(file_dir).as_posix(),
                    filename.as_posix(),
                ))

            dataset = gr.Dataset(
                components=[fs_filename, fs_file],
                samples=dataset_state,
            )
            return dataset_state, dataset

        # Clicking a sample fills both outputs from the absolute path.
        # NOTE(review): x[1] is used for the filename textbox as well —
        # x[0] (the relative name) may have been intended; confirm.
        fs_filelist_dataset.click(
            fn=lambda x: (
                x[1], x[1]
            ),
            inputs=[fs_filelist_dataset],
            outputs=[fs_filename, fs_file]
        )
        fs_query.click(
            fn=when_click_query_files,
            inputs=[fs_file_dir],
            outputs=[fs_filelist_dataset_state, fs_filelist_dataset]
        )
    return locals()
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
if __name__ == "__main__":
|
| 65 |
+
with gr.Blocks() as block:
|
| 66 |
+
fs_components = get_fs_tab()
|
| 67 |
+
block.launch()
|
tabs/shell_tab.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from toolbox.os.command import Command
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def shell(cmd: str):
    """Execute *cmd* via the project Command helper and return its output."""
    result = Command.popen(cmd)
    return result
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_shell_tab():
    """Build the "shell" Gradio tab: a textbox whose contents are run as a
    shell command, with the output shown below.

    Returns ``locals()`` so the caller can access the created components.
    """
    with gr.TabItem("shell"):
        shell_text = gr.Textbox(label="cmd")
        shell_button = gr.Button("run")
        shell_output = gr.Textbox(label="output", max_lines=100)

        shell_button.click(
            shell,
            inputs=[shell_text, ],
            outputs=[shell_output],
        )

    return locals()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
if __name__ == "__main__":
|
| 28 |
+
pass
|
toolbox/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
pass
|
toolbox/os/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
if __name__ == '__main__':
|
| 6 |
+
pass
|
toolbox/os/command.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Command(object):
    """Minimal shell-command helper for the web UI.

    Most commands are delegated to :func:`os.popen`; a few ("cd") are
    handled in-process so they affect this process's state.

    Bug fix: ``popen("")`` previously crashed with ``TypeError`` while
    unpacking the ``None`` that ``_get_cmd`` returns for a blank command;
    it now returns an empty string.
    """

    # Commands executed in-process instead of being passed to the shell.
    custom_command = [
        "cd"
    ]

    @staticmethod
    def _get_cmd(command):
        """Split *command* into ``(cmd, args)``; return None for blank input."""
        command = str(command).strip()
        if command == "":
            return None
        cmd_and_args = command.split(sep=" ")
        cmd = cmd_and_args[0]
        args = " ".join(cmd_and_args[1:])
        return cmd, args

    @classmethod
    def popen(cls, command):
        """Run *command* and return its stdout as text.

        Blank commands return "". "cd" (and any other entry in
        ``custom_command``) is dispatched to the matching classmethod.
        """
        parsed = cls._get_cmd(command)
        if parsed is None:
            # Blank command: nothing to run, nothing to show.
            return ""
        cmd, args = parsed
        if cmd in cls.custom_command:
            method = getattr(cls, cmd)
            return method(args)
        else:
            # NOTE(review): os.popen hands the raw string to a shell —
            # only safe with trusted input.
            resp = os.popen(command)
            result = resp.read()
            resp.close()
            return result

    @classmethod
    def cd(cls, args):
        """Change this process's working directory (absolute or relative)."""
        if args.startswith("/"):
            os.chdir(args)
        else:
            pwd = os.getcwd()
            path = os.path.join(pwd, args)
            os.chdir(path)

    @classmethod
    def system(cls, command):
        """Run *command* via os.system and return its exit status."""
        return os.system(command)

    def __init__(self):
        pass
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def ps_ef_grep(keyword: str):
    """Return the `ps -ef` lines containing *keyword*, excluding the grep
    process itself."""
    output = Command.popen("ps -ef | grep {}".format(keyword))
    lines = str(output).split("\n")
    return [line for line in lines if keyword in line and "grep" not in line]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
|
| 59 |
+
pass
|
toolbox/os/environment.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from dotenv.main import DotEnv
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class EnvironmentManager(object):
    """Load a dotenv file into ``os.environ`` and expose typed lookups."""

    def __init__(self, filename, override=False):
        # Remember the path so open_dotenv can re-read it on demand.
        self.filename = filename

        load_dotenv(
            dotenv_path=filename,
            override=override
        )

    def open_dotenv(self, filename: str = None):
        """Parse *filename* (default: the file given at init) into a dict
        without touching ``os.environ``."""
        target = filename or self.filename
        dotenv = DotEnv(
            dotenv_path=target,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        return dotenv.dict()

    def get(self, key, default=None, dtype=str):
        """Return ``os.environ[key]`` cast via *dtype*, or *default* when
        the variable is unset (the default is returned uncast)."""
        raw = os.environ.get(key)
        if raw is None:
            return default
        return dtype(raw)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def main():
    """Module entry point; currently a no-op placeholder."""
    return
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if __name__ == '__main__':
|
| 48 |
+
main()
|
toolbox/os/other.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import inspect
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def pwd():
    """Return the directory of the file from which this function is called."""
    # stack()[1] is the caller's frame; resolve it back to its module.
    frame = inspect.stack()[1]
    module = inspect.getmodule(frame[0])
    # NOTE(review): raises AttributeError if the caller has no module/__file__
    # (e.g. interactive shell or exec) — confirm callers are always files.
    return os.path.dirname(os.path.abspath(module.__file__))
|