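"""Run SWE-bench-style inference with OpenHands agents.

For each benchmark instance, this script builds a per-instance Docker sandbox,
sends the agent an instruction derived from the issue description ('swe' mode)
or a test-reproduction instruction ('swt' / 'swt-ci' modes), collects the
resulting git patch, and writes one EvalOutput record per instance to
output.jsonl. An optional iterative mode (ITERATIVE_EVAL_MODE) re-runs
instances that the AgentFinishedCritic marks as failed, up to
ITERATIVE_EVAL_MODE_MAX_ATTEMPTS attempts, and then aggregates the
per-attempt results into the final output file.
"""
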
import asyncio
import copy
import json
import os
import tempfile
from typing import Any, Literal

import pandas as pd
import toml
from datasets import load_dataset

import openhands.agenthub
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
    remove_binary_diffs,
    remove_binary_files_from_git,
)
from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
    MAP_REPO_TO_INSTALL,
    MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
    MAP_VERSION_TO_INSTALL,
)
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    get_llm_config_arg,
    get_parser,
)
from openhands.core.config.condenser_config import NoOpCondenserConfig
from openhands.core.config.utils import get_condenser_config_arg
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.critic import AgentFinishedCritic
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
    FileReadObservation,
)
from openhands.events.serialization.event import event_from_dict, event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'

BenchMode = Literal['swe', 'swt', 'swt-ci']

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}


def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
    return f'{instance.repo}__{instance.version}'.replace('/', '__')


def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    mode = metadata.details['mode']

    if mode.startswith('swt'):
        test_instructions = (
            f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
            if mode.endswith('ci')
            else ''
        )
        instruction = f"""\
<uploaded_files>
/workspace/{workspace_dir_name}
</uploaded_files>

I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:

<issue_description>
{instance.problem_statement}
</issue_description>

Can you help me implement the necessary changes to the repository to test whether the issue in <issue_description> was resolved?
I will take care of all changes to any of the non-test files. This means you DON'T have to modify the actual logic and ONLY have to update test logic and tests!
Your task is to make the minimal changes to test files in the /workspace directory to reproduce the issue in the <issue_description>, i.e., such that the generated tests fail in the current state (where the issue is unresolved) and pass once the issue is resolved.

Follow these steps to reproduce the issue:
1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
2. Create a script `reproduction.py` to reproduce the error and execute it with `python reproduction.py` using the BashTool, to confirm the error.
3. Edit the source code of the repo to integrate your reproduction script into the test framework.
4. Run the test framework and make sure your tests fail! Only submit FAILING tests! Never submit passing tests.
{test_instructions}Your thinking should be thorough and so it's fine if it's very long.
"""
    else:
        instruction = f"""
<uploaded_files>
/workspace/{workspace_dir_name}
</uploaded_files>

I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:

<issue_description>
{instance.problem_statement}
</issue_description>

Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
Your task is to make the minimal changes to non-test files in the /workspace/{workspace_dir_name} directory to ensure the <issue_description> is satisfied.

Follow these phases to resolve the issue:

Phase 1. READING: read the problem and reword it in clearer terms
1.1 If there are code or config snippets, express in words any best practices or conventions in them.
1.2 Highlight error messages, method names, variables, file names, stack traces, and technical details.
1.3 Explain the problem in clear terms.
1.4 Enumerate the steps to reproduce the problem.
1.5 Highlight any best practices to take into account when testing and fixing the issue.

Phase 2. RUNNING: install and run the tests on the repository
2.1 Follow the readme.
2.2 Install the environment and anything needed.
2.3 Iterate and figure out how to run the tests.

Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
3.2 Identify all files related to the problem statement.
3.3 Propose the methods and files to fix the issue and explain why.
3.4 From the possible file locations, select the most likely location to fix the issue.

Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
4.1 Look at existing test files in the repository to understand the test format/structure.
4.2 Create a minimal reproduction script that reproduces the located issue.
4.3 Run the reproduction script to confirm you are reproducing the issue.
4.4 Adjust the reproduction script as necessary.

Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
5.1 State clearly what the problem is.
5.2 State clearly where the problem is located.
5.3 State clearly how the test reproduces the issue.
5.4 State clearly the best practices to take into account in the fix.
5.5 State clearly how to fix the problem.

Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
6.1 Make minimal, focused changes to fix the issue.

Phase 7. VERIFICATION: Test your implementation thoroughly.
7.1 Run your reproduction script to verify the fix works.
7.2 Add edge cases to your test script to ensure comprehensive coverage.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.

Phase 8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['base_commit']}.
8.1 Ensure you've fully addressed all requirements.
8.2 Run any tests in the repository related to:
  8.2.1 The issue you are fixing
  8.2.2 The files you modified
  8.2.3 The functions you changed
8.3 If any tests fail, revise your implementation until all tests pass.

Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
"""

    if RUN_WITH_BROWSING:
        instruction += (
            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
        )

    if 'image_assets' in instance:
        assets = json.loads(instance['image_assets'])
        assert 'problem_statement' in assets, (
            'problem_statement is required in image_assets'
        )
        image_urls = assets['problem_statement']
        return MessageAction(content=instruction, image_urls=image_urls)
    return MessageAction(content=instruction)


# TODO: migrate all swe-bench docker to ghcr.io/openhands
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
)
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')


def get_instance_docker_image(
    instance_id: str,
    swebench_official_image: bool = False,
) -> str:
    if swebench_official_image:
        # Official SWE-Bench image
        # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
        docker_image_prefix = 'docker.io/swebench/'
        repo, name = instance_id.split('__')
        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
        logger.debug(f'Using official SWE-Bench image: {image_name}')
        return image_name
    else:
        # OpenHands version of the image
        docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
        image_name = 'sweb.eval.x86_64.' + instance_id
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
        return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()
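

# For reference: with the default prefix above, an (illustrative) instance id such as
# 'django__django-11333' maps to the OpenHands-style image name
# 'docker.io/xingyaoww/sweb.eval.x86_64.django_s_django-11333'.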


def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
    # We use a different instance image for each instance of the swe-bench eval
    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()

    base_container_image = get_instance_docker_image(
        instance['instance_id'],
        swebench_official_image=use_swebench_official_image,
    )
    logger.info(
        f'Using instance container image: {base_container_image}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
    sandbox_config.enable_auto_lint = True
    sandbox_config.use_host_network = False
    # Add platform to the sandbox config to solve issue 4401
    sandbox_config.platform = 'linux/amd64'
    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )

    config = OpenHandsConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    agent_config = AgentConfig(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
        enable_mcp=False,
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
    config.set_agent_config(agent_config)
    return config


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
    metadata: EvalMetadata,
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    obs: CmdOutputObservation

    # Set instance id and git configuration
    action = CmdRunAction(
        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')

    # inject the init script
    script_dir = os.path.dirname(__file__)

    # inject the instance info
    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
    )

    swe_instance_json_name = 'swe-bench-instance.json'
    with tempfile.TemporaryDirectory() as temp_dir:
        # Construct the full path for the desired file name within the temporary directory
        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
        # Write to the file with the desired name within the temporary directory
        with open(temp_file_path, 'w') as f:
            if not isinstance(instance, dict):
                json.dump([instance.to_dict()], f)
            else:
                json.dump([instance], f)

        # Copy the file to the desired location
        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

        # inject the instance swe entry
        runtime.copy_to(
            str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
            '/swe_util/',
        )

    action = CmdRunAction(command='cat ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    if isinstance(obs, ErrorObservation):
        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
    )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git reset --hard')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

    if metadata.details['mode'] == 'swt-ci':
        # set up repo
        setup_commands = []
        if instance['repo'] in MAP_REPO_TO_INSTALL:
            setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])

        # Run pre-install set up if provided
        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
            instance['version'], []
        )
        if 'pre_install' in install:
            for pre_install in install['pre_install']:
                setup_commands.append(pre_install)
        if 'install' in install:
            setup_commands.append(install['install'])

        for command in setup_commands:
            action = CmdRunAction(command=command)
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if 'multimodal' not in metadata.dataset.lower():
        # Only for non-multimodal datasets, we need to activate the testbed environment for Python
        # SWE-Bench multimodal datasets are not using the testbed environment
        action = CmdRunAction(command='which python')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            obs.exit_code == 0 and 'testbed' in obs.content,
            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
        )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)


def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has finished running.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to kill the previous command
        logger.info('The previous command is still running, trying to kill it...')
        action = CmdRunAction(command='C-c')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to suspend the previous command
        logger.info('The previous command is still running, trying to ctrl+z it...')
        action = CmdRunAction(command='C-z')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git config --global core.pager ""')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git config --global core.pager "": {str(obs)}',
    )

    # First check for any git repositories in subdirectories
    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to find git repositories: {str(obs)}',
    )

    git_dirs = [p for p in obs.content.strip().split('\n') if p]
    if git_dirs:
        # Remove all .git directories in subdirectories
        for git_dir in git_dirs:
            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            assert_and_raise(
                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
                f'Failed to remove git directory {git_dir}: {str(obs)}',
            )

    # add all files
    action = CmdRunAction(command='git add -A')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git add -A: {str(obs)}',
    )

    # Remove binary files from git staging
    action = CmdRunAction(command=remove_binary_files_from_git())
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to remove binary files: {str(obs)}',
    )

    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
        )
        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                # Read the patch file
                action = FileReadAction(path='patch.diff')
                action.set_hard_timeout(max(300 + 100 * n_retries, 600))
                logger.info(action, extra={'msg_type': 'ACTION'})
                obs = runtime.run_action(action)
                logger.info(obs, extra={'msg_type': 'OBSERVATION'})
                if isinstance(obs, FileReadObservation):
                    git_patch = obs.content
                    break
                elif isinstance(obs, ErrorObservation):
                    # Fall back to cat "patch.diff" to get the patch
                    assert 'File could not be decoded as utf-8' in obs.content
                    action = CmdRunAction(command='cat patch.diff')
                    action.set_hard_timeout(max(300 + 100 * n_retries, 600))
                    logger.info(action, extra={'msg_type': 'ACTION'})
                    obs = runtime.run_action(action)
                    assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
                    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
                    git_patch = obs.content
                    break
                else:
                    assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
            else:
                logger.info('Failed to get git diff, retrying...')
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    # Remove binary diffs from the patch
    git_patch = remove_binary_diffs(git_patch)

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:
    config = get_config(instance, metadata)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Increase resource_factor with increasing attempt_id
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            8,
        )
        logger.warning(
            f'This is attempt {runtime_failure_count + 1} for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )

    metadata = copy.deepcopy(metadata)
    metadata.details['runtime_failure_count'] = runtime_failure_count
    metadata.details['remote_runtime_resource_factor'] = (
        config.sandbox.remote_runtime_resource_factor
    )

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance, metadata)

        message_action = get_instruction(instance, metadata)

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=message_action,
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # if fatal error, throw EvalError to trigger re-run
        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS SWE-Bench specific =======
        # Get git patch
        return_val = complete_runtime(runtime, instance)
        git_patch = return_val['git_patch']
        logger.info(
            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
        )
    finally:
        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
    # we use eval_infer.sh to evaluate the agent's edits, not here,
    # because the agent may alter the environment / testcases
    test_result = {
        'git_patch': git_patch,
    }

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    # Save the output
    instruction = message_action.content
    if message_action.image_urls:
        instruction += (
            '\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
        )
    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),  # SWE Bench specific
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
    )
    return output


def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = toml.load(file)
            if 'selected_ids' in data:
                selected_ids = data['selected_ids']
                logger.info(
                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
                )
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
            if 'selected_repos' in data:
                # repos for the swe-bench instances:
                # ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
                selected_repos = data['selected_repos']
                if isinstance(selected_repos, str):
                    selected_repos = [selected_repos]
                assert isinstance(selected_repos, list)
                logger.info(
                    f'Filtering tasks to repos {selected_repos} from "selected_repos"...'
                )
                subset = dataset[dataset['repo'].isin(selected_repos)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
    # Drop empty entries so an unset SKIP_IDS does not trigger a spurious filter
    skip_ids = [sid for sid in os.environ.get('SKIP_IDS', '').split(',') if sid]
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset
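

# Example config.toml placed next to this script (values are illustrative):
#
#   selected_ids = ["django__django-11333"]
#
# or, to filter by repository instead:
#
#   selected_repos = ["django/django"]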


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='dataset to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    parser.add_argument(
        '--mode',
        type=str,
        default='swe',
        choices=['swe', 'swt', 'swt-ci'],
        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )
    if 'SWE-Gym' in args.dataset:
        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'split',
                'swegym_verified_instances.json',
            ),
            'r',
        ) as f:
            swegym_verified_instances = json.load(f)
        swe_bench_tests = swe_bench_tests[
            swe_bench_tests['instance_id'].isin(swegym_verified_instances)
        ]
        logger.info(
            f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
        )

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        llm_config.log_completions = True
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
        llm_config.modify_params = False
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    # Get condenser config from environment variable
    condenser_name = os.environ.get('EVAL_CONDENSER')
    if condenser_name:
        condenser_config = get_condenser_config_arg(condenser_name)
        if condenser_config is None:
            raise ValueError(
                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
            )
    else:
        # If no specific condenser config is provided via env var, default to NoOpCondenser
        condenser_config = NoOpCondenserConfig()
        logger.debug(
            'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
        )

    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
        condenser_config=condenser_config,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')

    # Run evaluation in iterative mode:
    # If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR a total of 3 attempts have been made.
    ITERATIVE_EVAL_MODE = (
        os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
    )
    ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
        os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
    )

    if not ITERATIVE_EVAL_MODE:
        # load the dataset
        instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
        if len(instances) > 0 and not isinstance(
            instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
        ):
            for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
                instances[col] = instances[col].apply(lambda x: str(x))

        run_evaluation(
            instances,
            metadata,
            output_file,
            args.eval_num_workers,
            process_instance,
            timeout_seconds=8 * 60 * 60,  # 8 hours PER instance should be more than enough
            max_retries=5,
        )
    else:
        critic = AgentFinishedCritic()

        def get_cur_output_file_path(attempt: int) -> str:
            return (
                f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
            )

        eval_ids = None
        for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
            cur_output_file = get_cur_output_file_path(attempt)
            logger.info(
                f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
            )

            # If the LLM is configured to be deterministic (temperature 0), bump the temperature
            # to 0.1 for attempts after the first so we hopefully get slightly different results
            if attempt > 1 and metadata.llm_config.temperature == 0:
                logger.info(
                    f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
                )
                metadata.llm_config.temperature = 0.1

            # Load instances - on the first attempt, we evaluate all instances.
            # On subsequent attempts, we only evaluate the instances that failed the previous attempt as determined by the critic.
            instances = prepare_dataset(
                swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
            )
            if len(instances) > 0 and not isinstance(
                instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
            ):
                for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
                    instances[col] = instances[col].apply(lambda x: str(x))

            # Run evaluation - but save the results to cur_output_file
            logger.info(
                f'Evaluating {len(instances)} instances for attempt {attempt}...'
            )
            run_evaluation(
                instances,
                metadata,
                cur_output_file,
                args.eval_num_workers,
                process_instance,
                timeout_seconds=8 * 60 * 60,  # 8 hours PER instance should be more than enough
                max_retries=5,
            )

            # When eval is done, we update eval_ids to the instances that failed the current attempt
            instances_failed = []
            logger.info(
                f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
            )
            with open(cur_output_file, 'r') as f:
                for line in f:
                    instance = json.loads(line)
                    try:
                        history = [
                            event_from_dict(event) for event in instance['history']
                        ]
                        critic_result = critic.evaluate(
                            history, instance['test_result'].get('git_patch', '')
                        )
                        if not critic_result.success:
                            instances_failed.append(instance['instance_id'])
                    except Exception as e:
                        logger.error(
                            f'Error loading history for instance {instance["instance_id"]}: {e}'
                        )
                        instances_failed.append(instance['instance_id'])
            logger.info(
                f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
            )
            eval_ids = instances_failed

            # If no instances failed, we break
            if len(instances_failed) == 0:
                break

        # Then we aggregate the results from all attempts into the original output file,
        # preferring the latest successful attempt for each instance
        logger.info(
            'Aggregating results from all attempts into the original output file...'
        )
        fout = open(output_file, 'w')
        added_instance_ids = set()
        for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
            cur_output_file = get_cur_output_file_path(attempt)
            if not os.path.exists(cur_output_file):
                logger.warning(
                    f'Intermediate output file {cur_output_file} does not exist. Skipping...'
                )
                continue
            with open(cur_output_file, 'r') as f:
                for line in f:
                    instance = json.loads(line)
                    # Also make sure git_patch is not empty - otherwise we fall back to a previous attempt (an empty patch is worse than anything else)
                    if (
                        instance['instance_id'] not in added_instance_ids
                        and instance['test_result'].get('git_patch', '').strip()
                    ):
                        fout.write(line)
                        added_instance_ids.add(instance['instance_id'])
            logger.info(
                f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
            )
        fout.close()
        logger.info(
            f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
        )
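
# Example invocation (illustrative). The script path is an assumption based on this file's
# imports (evaluation/benchmarks/swe_bench/); only flags defined or referenced in this file
# are shown, and get_parser() supplies the remaining standard evaluation flags:
#
#   ITERATIVE_EVAL_MODE=true EVAL_CONDENSER=<condenser_name> \
#   python evaluation/benchmarks/swe_bench/run_infer.py \
#       --dataset princeton-nlp/SWE-bench --split test --mode swe \
#       --llm_config <llm_config_name>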