Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| import argparse | |
| import json | |
| from collections import Counter | |
| from openhands.events.serialization import event_from_dict | |
| from openhands.events.utils import get_pairs_from_events | |
| ERROR_KEYWORDS = [ | |
| 'Agent encountered an error while processing the last action', | |
| 'APIError', | |
| 'Action execution failed', | |
| ] | |
| if __name__ == '__main__': | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument('output_file', type=str, help='The file to summarize') | |
| args = parser.parse_args() | |
| with open(args.output_file, 'r') as file: | |
| lines = file.readlines() | |
| num_lines = len(lines) | |
| num_error_lines = 0 | |
| num_agent_stuck_in_loop = 0 | |
| coverage = 0 | |
| mutation_score = 0 | |
| num_empty_suite = 0 | |
| error_counter = Counter() | |
| main_agent_cost = [] | |
| editor_cost = [] | |
| num_turns = [] | |
| for line in lines: | |
| _d = json.loads(line) | |
| # Cost | |
| costs = _d['metrics'].get('costs', []) | |
| _cur_main_agent_cost = 0 | |
| _cur_editor_cost = 0 | |
| for cost in costs: | |
| if isinstance(cost, float): | |
| # backward compatible | |
| _cur_main_agent_cost += cost | |
| else: | |
| if 'draft_editor' in cost['model']: | |
| _cur_editor_cost += cost['cost'] | |
| else: | |
| _cur_main_agent_cost += cost['cost'] | |
| main_agent_cost.append(_cur_main_agent_cost) | |
| editor_cost.append(_cur_editor_cost) | |
| # Turn status | |
| history = _d.get('history', []) | |
| events = [event_from_dict(event) for event in history] | |
| pairs = get_pairs_from_events(events) | |
| num_turns.append(len(pairs)) | |
| # Suite & resolve status | |
| suite = _d.get('test_result', {}).get('test_suite', '') | |
| if suite == '': | |
| num_empty_suite += 1 | |
| continue | |
| report = _d.get('report', {}) or {} | |
| coverage += report.get('coverage', 0) | |
| mutation_score += report.get('mutation_score', 0) | |
| # Error | |
| error = _d.get('error', None) | |
| if error is not None and isinstance(error, str): | |
| agent_stuck_in_loop = 'Agent got stuck in a loop' in error | |
| contains_error = bool(error) and not agent_stuck_in_loop | |
| if agent_stuck_in_loop: | |
| error_counter['Agent got stuck in a loop'] += 1 | |
| num_agent_stuck_in_loop += 1 | |
| elif contains_error: | |
| error_counter[error] += 1 | |
| continue | |
| for keyword in ERROR_KEYWORDS: | |
| if keyword in line: | |
| error_counter[keyword] += 1 | |
| num_error_lines += 1 | |
| break | |
| # print the error counter (with percentage) | |
| print(f'Average coverage for {num_lines} ({coverage / num_lines * 100:.2f}%)') | |
| print( | |
| f'Average mutation score for {num_lines} ({mutation_score / num_lines * 100:.2f}%)' | |
| ) | |
| print( | |
| f'Number of empty suite: {num_empty_suite} / {num_lines} ({num_empty_suite / num_lines * 100:.2f}%)' | |
| ) | |
| print( | |
| f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)' | |
| ) | |
| print( | |
| f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)' | |
| ) | |
| assert len(num_turns) == num_lines | |
| assert len(main_agent_cost) == num_lines | |
| assert len(editor_cost) == num_lines | |
| print('## Statistics') | |
| print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}') | |
| print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD') | |
| print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD') | |
| print( | |
| f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD' | |
| ) | |
| print('## Detailed error breakdown:') | |
| for error, count in error_counter.items(): | |
| print(f'{error}: {count} ({count / num_lines * 100:.2f}%)') | |