Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| import argparse | |
| import glob | |
| import json | |
| import os | |
| import random | |
| from collections import Counter | |
| import numpy as np | |
| import pandas as pd | |
| from openhands.events.serialization import event_from_dict | |
| from openhands.events.utils import get_pairs_from_events | |
| ERROR_KEYWORDS = [ | |
| 'Agent encountered an error while processing the last action', | |
| 'APIError', | |
| 'Action execution failed', | |
| 'litellm.Timeout: APITimeoutError', | |
| ] | |
| def get_bootstrap_accuracy_error_bars( | |
| values: float | int | bool, num_samples: int = 1000, p_value=0.05 | |
| ) -> tuple[float, float]: | |
| sorted_vals = np.sort( | |
| [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)] | |
| ) | |
| bottom_idx = int(num_samples * p_value / 2) | |
| top_idx = int(num_samples * (1.0 - p_value / 2)) | |
| return (sorted_vals[bottom_idx], sorted_vals[top_idx]) | |
| def process_file(file_path): | |
| with open(file_path, 'r') as file: | |
| lines = file.readlines() | |
| num_lines = len(lines) | |
| num_error_lines = 0 | |
| num_agent_stuck_in_loop = 0 | |
| num_resolved = 0 | |
| resolved_arr = [] | |
| num_empty_patch = 0 | |
| num_unfinished_runs = 0 | |
| error_counter = Counter() | |
| main_agent_cost = [] | |
| editor_cost = [] | |
| num_turns = [] | |
| for line in lines: | |
| _d = json.loads(line) | |
| if 'metrics' not in _d or _d['metrics'] is None: | |
| # this is a failed run | |
| num_unfinished_runs += 1 | |
| continue | |
| # Cost | |
| costs = _d['metrics'].get('costs', []) | |
| _cur_main_agent_cost = 0 | |
| _cur_editor_cost = 0 | |
| for cost in costs: | |
| if isinstance(cost, float): | |
| # backward compatible | |
| _cur_main_agent_cost += cost | |
| else: | |
| if 'draft_editor' in cost['model']: | |
| _cur_editor_cost += cost['cost'] | |
| else: | |
| _cur_main_agent_cost += cost['cost'] | |
| main_agent_cost.append(_cur_main_agent_cost) | |
| editor_cost.append(_cur_editor_cost) | |
| # Turn status | |
| history = _d.get('history', []) | |
| events = [event_from_dict(event) for event in history] | |
| pairs = get_pairs_from_events(events) | |
| num_turns.append(len(pairs)) | |
| # Patch & resolve status | |
| patch = _d.get('test_result', {}).get('git_patch', '') | |
| if patch == '': | |
| num_empty_patch += 1 | |
| continue | |
| report = _d.get('report', {}) or {} | |
| resolved = report.get('resolved', False) | |
| if resolved: | |
| num_resolved += 1 | |
| resolved_arr.append(1) | |
| else: | |
| resolved_arr.append(0) | |
| # Error | |
| error = _d.get('error', None) | |
| if error is not None and isinstance(error, str): | |
| agent_stuck_in_loop = 'Agent got stuck in a loop' in error | |
| contains_error = bool(error) and not agent_stuck_in_loop | |
| if agent_stuck_in_loop: | |
| error_counter['Agent got stuck in a loop'] += 1 | |
| num_agent_stuck_in_loop += 1 | |
| elif contains_error: | |
| error_counter[error] += 1 | |
| continue | |
| for keyword in ERROR_KEYWORDS: | |
| if keyword in line: | |
| error_counter[keyword] += 1 | |
| num_error_lines += 1 | |
| break | |
| return { | |
| 'file_path': file_path, | |
| 'total_instances': num_lines, | |
| 'resolved': { | |
| 'count': num_resolved, | |
| 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, | |
| 'ci': tuple( | |
| x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr) | |
| ), | |
| }, | |
| 'empty_patches': { | |
| 'count': num_empty_patch, | |
| 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0, | |
| }, | |
| 'unfinished_runs': { | |
| 'count': num_unfinished_runs, | |
| 'percentage': (num_unfinished_runs / num_lines * 100) | |
| if num_lines > 0 | |
| else 0, | |
| }, | |
| 'errors': { | |
| 'total': num_error_lines, | |
| 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0, | |
| 'stuck_in_loop': { | |
| 'count': num_agent_stuck_in_loop, | |
| 'percentage': (num_agent_stuck_in_loop / num_lines * 100) | |
| if num_lines > 0 | |
| else 0, | |
| }, | |
| 'breakdown': { | |
| str(error): { | |
| 'count': count, | |
| 'percentage': (count / num_lines * 100) if num_lines > 0 else 0, | |
| } | |
| for error, count in error_counter.items() | |
| }, | |
| }, | |
| 'costs': { | |
| 'main_agent': sum(main_agent_cost), | |
| 'editor': sum(editor_cost), | |
| 'total': sum(main_agent_cost) + sum(editor_cost), | |
| }, | |
| 'statistics': { | |
| 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0, | |
| 'costs': { | |
| 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0, | |
| 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0, | |
| 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines | |
| if num_lines > 0 | |
| else 0, | |
| }, | |
| }, | |
| } | |
| def aggregate_directory(input_path) -> pd.DataFrame: | |
| # Process all output.jsonl files in subdirectories | |
| pattern = os.path.join(input_path, '**/output.jsonl') | |
| files = glob.glob(pattern, recursive=True) | |
| print(f'Processing {len(files)} files from directory {input_path}') | |
| # Process each file silently and collect results | |
| results = [] | |
| for file_path in files: | |
| try: | |
| result = process_file(file_path) | |
| results.append(result) | |
| except Exception as e: | |
| print(f'Error processing {file_path}: {str(e)}') | |
| import traceback | |
| traceback.print_exc() | |
| continue | |
| # Convert results to pandas DataFrame and sort by resolve rate | |
| df = pd.DataFrame(results) | |
| # Extract directory name from file path | |
| df['directory'] = df['file_path'].apply( | |
| lambda x: os.path.basename(os.path.dirname(x)) | |
| ) | |
| df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) | |
| df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci']) | |
| df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) | |
| df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) | |
| df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) | |
| df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) | |
| df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) | |
| df = df.sort_values('resolve_rate', ascending=False) | |
| return df | |
| if __name__ == '__main__': | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument( | |
| 'input_path', type=str, help='The file or directory to summarize' | |
| ) | |
| parser.add_argument( | |
| '--output', | |
| type=str, | |
| help='Output JSONL file for results', | |
| default='summary_results.jsonl', | |
| ) | |
| args = parser.parse_args() | |
| if os.path.isdir(args.input_path): | |
| df = aggregate_directory(args.input_path) | |
| # Create the summary string | |
| columns = [ | |
| 'directory', | |
| 'resolve_rate', | |
| 'empty_patch_rate', | |
| 'unfinished_rate', | |
| 'error_rate', | |
| 'avg_turns', | |
| 'avg_cost', | |
| 'total_instances', | |
| ] | |
| summary_str = df[columns].to_string( | |
| float_format=lambda x: '{:.2f}'.format(x), | |
| formatters={ | |
| 'directory': lambda x: x[:90] | |
| }, # Truncate directory names to 20 chars | |
| index=False, | |
| ) | |
| # Print to console | |
| print('\nResults summary (sorted by resolve rate):') | |
| print(summary_str) | |
| # Save to text file | |
| txt_output = args.output.rsplit('.', 1)[0] + '.txt' | |
| with open(txt_output, 'w') as f: | |
| f.write('Results summary (sorted by resolve rate):\n') | |
| f.write(summary_str) | |
| # Save | |
| df.to_json(args.output, lines=True, orient='records') | |
| df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) | |
| else: | |
| # Process single file with detailed output | |
| results = [] | |
| try: | |
| result = process_file(args.input_path) | |
| results.append(result) | |
| # Print detailed results for single file | |
| print(f'\nResults for {args.input_path}:') | |
| print( | |
| f'Number of resolved: {result["resolved"]["count"]} / {result["total_instances"]} ({result["resolved"]["percentage"]:.2f}% [{result["resolved"]["ci"][0]:.2f}%, {result["resolved"]["ci"][1]:.2f}%])' | |
| ) | |
| print( | |
| f'Number of empty patch: {result["empty_patches"]["count"]} / {result["total_instances"]} ({result["empty_patches"]["percentage"]:.2f}%)' | |
| ) | |
| print( | |
| f'Number of error lines: {result["errors"]["total"]} / {result["total_instances"]} ({result["errors"]["percentage"]:.2f}%)' | |
| ) | |
| print( | |
| f'Number of agent stuck in loop: {result["errors"]["stuck_in_loop"]["count"]} / {result["total_instances"]} ({result["errors"]["stuck_in_loop"]["percentage"]:.2f}%)' | |
| ) | |
| print( | |
| f'Number of unfinished runs: {result["unfinished_runs"]["count"]} / {result["total_instances"]} ({result["unfinished_runs"]["percentage"]:.2f}%)' | |
| ) | |
| print(f'Total cost: {result["costs"]["total"]:.2f} USD') | |
| print('## Statistics') | |
| print( | |
| f'Avg. num of turns per instance: {result["statistics"]["avg_turns"]:.2f}' | |
| ) | |
| print( | |
| f'Avg. agent cost per instance: {result["statistics"]["costs"]["main_agent"]:.2f} USD' | |
| ) | |
| print( | |
| f'Avg. editor cost per instance: {result["statistics"]["costs"]["editor"]:.2f} USD' | |
| ) | |
| print( | |
| f'Avg. total cost per instance: {result["statistics"]["costs"]["total"]:.2f} USD' | |
| ) | |
| print('## Detailed error breakdown:') | |
| for error, data in result['errors']['breakdown'].items(): | |
| print(f'{error}: {data["count"]} ({data["percentage"]:.2f}%)') | |
| except Exception as e: | |
| print(f'Error processing {args.input_path}: {str(e)}') | |