"""Classify failure categories (E1-E5) for failed evaluation cases using an LLM."""
import json
import os
import pprint
from collections import Counter

import tqdm

from openhands.core.config import get_llm_config_arg, get_parser, load_openhands_config
from openhands.core.logger import openhands_logger as logger
from openhands.llm.llm import LLM
# Module-level OpenHands configuration; the entry point below may override
# `config.llm` with the --llm_config command-line selection.
config = load_openhands_config()
def extract_test_results(
    res_file_path: str,
) -> tuple[list[dict], list[dict], list[float]]:
    """Parse a JSONL evaluation-results file into passed and failed cases.

    Each line of ``res_file_path`` must be a JSON object with at least
    ``instance_id``, ``repo``, ``instruction``, ``eval_script``,
    ``eval_exit_code``, ``eval_output``, ``metadata``, ``history`` and
    ``metrics`` (with ``success`` and ``accumulated_cost``) keys.

    Side effect: the input file is rewritten in place, deduplicated by
    ``instance_id`` and sorted by ``instance_id``, for stable diffs.

    Args:
        res_file_path: Path to the JSONL results file.

    Returns:
        A tuple ``(passed, failed, costs)`` where ``passed`` and ``failed``
        are lists of per-instance summary dicts (failed entries additionally
        carry ``metadata`` and ``history`` for error classification) and
        ``costs`` is the list of accumulated costs, one per unique instance,
        in file order.
    """
    passed: list[dict] = []
    failed: list[dict] = []
    costs: list[float] = []
    seen_instance_ids: set = set()
    instances: list[dict] = []
    with open(res_file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            # Skip duplicates: only the first occurrence of an instance counts.
            if data['instance_id'] in seen_instance_ids:
                print(f'WARNING: Duplicate instance_id found: {data["instance_id"]}')
                continue
            seen_instance_ids.add(data['instance_id'])
            instances.append(data)
            if data['metrics']['success']:
                passed.append(
                    {
                        'instance_id': data['instance_id'],
                        'repo': data['repo'],
                        'instruction': data['instruction'],
                        'eval_script': data['eval_script'],
                        'eval_exit_code': data['eval_exit_code'],
                        'eval_output': data['eval_output'],
                        'accumulated_cost': data['metrics']['accumulated_cost'],
                    }
                )
            else:
                # Failed cases keep metadata/history so the LLM classifier
                # has enough context to pick an error category.
                failed.append(
                    {
                        'instance_id': data['instance_id'],
                        'repo': data['repo'],
                        'instruction': data['instruction'],
                        'metadata': data['metadata'],
                        'history': data['history'],
                        'eval_script': data['eval_script'],
                        'eval_exit_code': data['eval_exit_code'],
                        'eval_output': data['eval_output'],
                        'accumulated_cost': data['metrics']['accumulated_cost'],
                    }
                )
            costs.append(data['metrics']['accumulated_cost'])
    # Rewrite the file deduplicated and sorted by instance_id.
    instances.sort(key=lambda x: x['instance_id'])
    with open(res_file_path, 'w') as file:
        for instance in instances:
            file.write(json.dumps(instance) + '\n')
    return passed, failed, costs
def classify_error(llm: LLM, failed_case: dict) -> str:
    """Classify a failed evaluation case into one of the categories E1-E5.

    Sends the case's instruction, eval script, history and eval output to the
    LLM and expects a bare category label back. If the LLM call fails, falls
    back to asking a human via stdin.

    Args:
        llm: The LLM used for classification.
        failed_case: A failed-case dict produced by ``extract_test_results``
            (must contain ``instruction``, ``eval_script``, ``history``,
            ``eval_output`` and ``instance_id``).

    Returns:
        One of ``'E1'``..``'E5'``.

    Raises:
        ValueError: If neither the LLM nor the human fallback produced a
            valid category label.
    """
    # NOTE: fixed a stray 's' that was appended right after the eval script
    # in the original prompt template.
    prompt = f"""
Please classify the error for the following failed case based on the history and eval_output:
Instruction:
{failed_case['instruction']}
Eval Script:
{failed_case['eval_script']}
History:
{failed_case['history']}
Eval Output:
{failed_case['eval_output']}
The error categories are:
E1: Hallucination Errors - The model misinterpreted the user's intention, misplaced Python code and bash script, or generated random or irrelevant code.
E2: Lack of Knowledge or Information - The model lacks sufficient information or domain-specific knowledge to satisfy the user's requirements.
E3: Knowledge Manipulation - The model failed to integrate or manipulate information properly.
E4: Syntax Errors - The model generated code with syntax errors.
E5: Operational Error - The model gave up easily or exited without finishing the tasks.
Please provide only the error category (E1, E2, E3, E4, or E5) without any explanation.
"""
    valid_categories = ('E1', 'E2', 'E3', 'E4', 'E5')
    try:
        response = llm.completion(messages=[{'content': prompt, 'role': 'user'}])
        # Strip whitespace so an answer like 'E2\n' does not fail validation.
        error_category = response.choices[0].message['content'].strip()
    except Exception as e:
        logger.error(
            f'Failed to classify the error for the failed case: {failed_case["instance_id"]}'
        )
        logger.error(e)
        # Manual fallback: show the case to the operator and read the label.
        error_category = input(
            failed_case['instruction']
            + ': '
            + failed_case['eval_script']
            + ' - '
            + failed_case['eval_output']
        ).strip()
    if error_category not in valid_categories:
        raise ValueError(f'Invalid error category: {error_category}')
    return error_category
if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--json_file_path',
        type=str,
        required=True,
        help='Path to the jsonl file containing the evaluation results',
    )
    args, _ = parser.parse_known_args()

    # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/swe_bench/README.md#configure-openhands-and-your-llm
    # for details of how to set `llm_config`
    specified_llm_config = None
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
    if specified_llm_config is None:
        # Fail fast with a clear message: the original referenced
        # `specified_llm_config` even when --llm_config was missing or
        # unresolvable, crashing later with a NameError/AttributeError.
        raise ValueError(f'Could not resolve LLM config: --llm_config={args.llm_config}')
    # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
    specified_llm_config.modify_params = False
    config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')
    llm = LLM(llm_config=specified_llm_config)

    passed, new_failed, costs = extract_test_results(args.json_file_path)

    failed_file_path = args.json_file_path.replace('.jsonl', '_failed.jsonl')
    failed = []
    if os.path.exists(failed_file_path):
        # Resume support: previously classified failures are loaded and
        # skipped below instead of being re-sent to the LLM.
        with open(failed_file_path, 'r') as file:
            for line in file:
                failed.append(json.loads(line.strip()))
        print(f'Loaded {len(failed)} failed cases from {failed_file_path}')

    # Set membership is O(1) per lookup; the original rebuilt a list of ids
    # on every iteration (O(n^2) overall).
    classified_ids = {case['instance_id'] for case in failed}
    for failed_case in tqdm.tqdm(new_failed):
        if failed_case['instance_id'] in classified_ids:
            continue
        failed_case['error_category'] = classify_error(llm, failed_case)
        failed.append(failed_case)
        classified_ids.add(failed_case['instance_id'])
        # Append incrementally so progress survives interruption.
        with open(failed_file_path, 'a') as file:
            file.write(json.dumps(failed_case) + '\n')

    # Print the summary
    print('Summary:')
    print(f'Passed: {len(passed)}')
    print(f'Failed: {len(failed)}')
    print(f'Costs: {costs}')
    print('Failed cases:')
    error_categories = dict(Counter(case['error_category'] for case in failed))
    pprint.pprint(error_categories)