Spaces:
Sleeping
Sleeping
| #!/usr/bin/python3 | |
| # -*- coding: utf-8 -*- | |
| import argparse | |
| import json | |
| import os | |
| from pathlib import Path | |
| import sys | |
| import time | |
| pwd = os.path.abspath(os.path.dirname(__file__)) | |
| sys.path.append(os.path.join(pwd, "../../")) | |
| from project_settings import environment, project_path | |
def get_args():
    """Parse the command-line options of this dataset conversion script.

    Options:
        --raw_dataset: path of the raw dataset directory. Defaults to
            ``data/raw_dataset/agent-lingoace-zh-400-choice`` under
            ``project_path``.
        --dataset: path of the JSONL file to produce. Defaults to
            ``data/dataset/agent-lingoace-zh-400-choice.jsonl`` under
            ``project_path``.

    Returns:
        The ``argparse.Namespace`` holding ``raw_dataset`` and ``dataset``
        as strings.
    """
    # Defaults are anchored at the project root and rendered with forward
    # slashes so they look the same on every platform.
    default_raw_dir = project_path / "data/raw_dataset/agent-lingoace-zh-400-choice"
    default_jsonl = project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl"

    cli = argparse.ArgumentParser()
    cli.add_argument("--raw_dataset", default=default_raw_dir.as_posix(), type=str)
    cli.add_argument("--dataset", default=default_jsonl.as_posix(), type=str)
    return cli.parse_args()
def main():
    """Convert a directory of raw samples into a single JSONL dataset file.

    Each sub-directory of ``--raw_dataset`` is treated as one sample and is
    expected to contain three UTF-8 text files: ``system_prompt.txt``,
    ``user_prompt.txt`` and ``response.txt``. For every sample one JSON
    object is written as a line of ``--dataset`` with the keys:

    * ``idx``: the name of the sample directory,
    * ``prompt``: system prompt and user prompt joined by a blank line,
      with surrounding whitespace stripped,
    * ``response``: the content of ``response.txt``, unmodified.

    The prompt/response pair of every sample is also echoed to stdout.

    Raises:
        FileNotFoundError: if a sample directory lacks one of the three
            expected files.
    """
    args = get_args()
    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # ``glob`` also yields plain files (e.g. ``.DS_Store``) and returns
    # entries in arbitrary order. Keep directories only, and sort them
    # (lexicographically by path) so the generated file is reproducible.
    sample_dirs = sorted(p for p in raw_dataset.glob("*") if p.is_dir())

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            idx = sample_dir.name

            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

            prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()

            # Echo the sample so the conversion can be inspected by eye.
            print(f"{prompt}\n\n{response}")
            print("-" * 150)

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()