# Spaces:
# Build error
# Build error
# NOTE: this script is for rolling out the SWE-Gym dataset for **TRAINING**
# For more information, please refer to
# 1. the GitHub repo: https://github.com/SWE-Gym/SWE-Gym
# 2. the paper: https://arxiv.org/abs/2412.21139
# ----- Positional arguments -----
# $1  MODEL      LLM config name from config.toml
#                (e.g. "llm.claude-3-5-sonnet-20241022-t05")
# $2  EXP_NAME   experiment label, e.g. "train-t05"
# $3  N_WORKERS  value passed as --eval-num-workers (default: 64)
# $4  N_RUNS     number of rollout runs (default: 1)
MODEL=${1:-}
EXP_NAME=${2:-}
N_WORKERS=${3:-64}
N_RUNS=${4:-1}

# Exported so that child processes can read the experiment name.
export EXP_NAME

# use 2x resources for rollout since some codebases are pretty resource-intensive
export DEFAULT_RUNTIME_RESOURCE_FACTOR=2

echo "MODEL: $MODEL"
echo "EXP_NAME: $EXP_NAME"

# Change the suffix to "/SWE-Gym-Lite" (presumably "SWE-Gym/SWE-Gym-Lite" —
# TODO confirm) if you want to roll out the lite subset.
DATASET="SWE-Gym/SWE-Gym"
SPLIT="train"
#######################################
# Select the sandbox runtime based on whether ALLHANDS_API_KEY is present.
# Globals:
#   ALLHANDS_API_KEY (read)
#   RUNTIME (exported): "docker" when the key is empty/unset, else "remote"
#   SANDBOX_REMOTE_RUNTIME_API_URL, EVAL_DOCKER_IMAGE_PREFIX
#     (exported only on the remote path)
# Outputs: one status line on stdout describing the chosen runtime.
#######################################
configure_runtime() {
  if [[ -z "${ALLHANDS_API_KEY:-}" ]]; then
    echo "ALLHANDS_API_KEY is not set. Will rollout and evaluate locally using Docker. WARNING: A large value of N_WORKERS will result in a large number of Docker containers being spun up and may crash your machine."
    export RUNTIME=docker
  else
    echo "ALLHANDS_API_KEY is set. Continuing rollout and evaluation with remote runtime..."
    export RUNTIME=remote
    export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
    export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images"
  fi
}
configure_runtime
# Passed as --eval-n-limit to the inference and evaluation commands when
# non-empty.
EVAL_LIMIT=3000
# Passed as --max-iterations to the inference command.
MAX_ITER=100

# ===== Run inference =====
# get_openhands_version comes from the sourced helper; it presumably sets
# OPENHANDS_VERSION (not visible in this file — TODO confirm).
source "evaluation/utils/version_control.sh"
get_openhands_version

echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
# This script keeps the LLM config name in MODEL; MODEL_CONFIG is not assigned
# anywhere in the visible code, so fall back to MODEL instead of printing an
# empty value.
echo "MODEL_CONFIG: ${MODEL_CONFIG:-$MODEL}"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"

# Default to NOT use Hint
export USE_INSTANCE_IMAGE=true
export USE_HINT_TEXT=false
export RUN_WITH_BROWSING=false
echo "USE_HINT_TEXT: $USE_HINT_TEXT"

# Base label for every run; the per-run suffix is appended by the rollout loop.
EVAL_NOTE="$OPENHANDS_VERSION-no-hint-$EXP_NAME"
#######################################
# Run SWE-bench inference (run_infer.py) once via poetry.
# Globals (read): MODEL, MAX_ITER, N_WORKERS, DATASET, SPLIT, EVAL_LIMIT
# Arguments:
#   $1 - eval note, forwarded as --eval-note
# Outputs:
#   "EVAL_LIMIT: ..." on stdout when EVAL_LIMIT is non-empty, followed by
#   whatever the inference command itself prints.
# Returns:
#   exit status of the inference command.
#######################################
run_eval() {
  local eval_note=$1

  # Build the argv as an array so each value stays a single argument even if
  # it contains spaces or shell metacharacters (no eval, no re-parsing).
  local -a cmd=(
    poetry run python evaluation/benchmarks/swe_bench/run_infer.py
    --agent-cls CodeActAgent
    --llm-config "$MODEL"
    --max-iterations "$MAX_ITER"
    --eval-num-workers "$N_WORKERS"
    --eval-note "$eval_note"
    --dataset "$DATASET"
    --split "$SPLIT"
  )

  if [[ -n "$EVAL_LIMIT" ]]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    cmd+=(--eval-n-limit "$EVAL_LIMIT")
  fi

  # Run the command; its exit status becomes the function's status.
  "${cmd[@]}"
}
#######################################
# Extract the results path announced by inference as
# "### OUTPUT FILE: <path> ###".
# Arguments: $1 - captured stdout of the inference step
# Outputs:   the path on stdout (empty when no marker is present); when the
#            marker appears more than once, the last occurrence is used.
#######################################
extract_output_file() {
  sed -n 's/.*### OUTPUT FILE: \(.*\) ###.*/\1/p' <<<"$1" | tail -n 1
}

#######################################
# Run inference via run_eval, retrying (without limit) until it exits 0.
# The remote-runtime cleanup script is invoked after every attempt.
# Globals:   INFER_OUTPUT, INFER_STATUS (written)
# Arguments: $1 - eval note for this run
#######################################
run_inference_with_retry() {
  local eval_note=$1
  while true; do
    echo "### Running inference... ###"
    unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
    echo "EVAL_NOTE: $eval_note"
    INFER_OUTPUT=$(run_eval "$eval_note")
    INFER_STATUS=$? # Capture the exit status of run_eval
    echo "INFER_STATUS: $INFER_STATUS"

    echo "### Cleaning up remote runtime... ###"
    ./evaluation/utils/scripts/cleanup_remote_runtime.sh

    if ((INFER_STATUS == 0)); then
      echo "### Inference completed successfully. ###"
      break
    fi
    echo "### Inference failed with exit code $INFER_STATUS. Retrying... ###"
  done
}

#######################################
# Run eval_infer.py on an inference output file, retrying (without limit)
# until it exits 0. The cleanup script runs only after a failed attempt.
# Globals:   N_WORKERS, DATASET, SPLIT, EVAL_LIMIT (read); EVAL_STATUS (written)
# Arguments: $1 - path to the inference output file
#######################################
run_evaluation_with_retry() {
  local output_file=$1

  # argv array instead of an eval'd string: paths with spaces stay intact.
  local -a cmd=(
    poetry run python evaluation/benchmarks/swe_bench/eval_infer.py
    --eval-num-workers "$((N_WORKERS * 2))"
    --input-file "$output_file"
    --dataset "$DATASET"
    --split "$SPLIT"
  )
  if [[ -n "$EVAL_LIMIT" ]]; then
    cmd+=(--eval-n-limit "$EVAL_LIMIT")
  fi

  while true; do
    echo "### Evaluating on $output_file ... ###"
    if [[ -n "$EVAL_LIMIT" ]]; then
      echo "EVAL_LIMIT: $EVAL_LIMIT"
    fi
    echo "Running command: ${cmd[*]}"

    # Run the command
    "${cmd[@]}"
    EVAL_STATUS=$?
    if ((EVAL_STATUS == 0)); then
      echo "### Evaluation completed successfully. ###"
      break
    fi
    echo "### Evaluation failed with exit code $EVAL_STATUS. Retrying... ###"
    ./evaluation/utils/scripts/cleanup_remote_runtime.sh
  done
}

#######################################
# One full rollout: inference -> evaluation -> merge results -> combine
# completions.
# Globals:   EVAL_NOTE (read); OUTPUT_FILE (written)
# Arguments: $1 - run index, used in the eval note suffix "-run_<idx>"
# Returns:   1 if inference succeeded but never announced an output file
#            (evaluating an empty path could only fail and retry forever);
#            otherwise the status of the final echo.
#######################################
rollout_once() {
  local idx=$1

  run_inference_with_retry "$EVAL_NOTE-run_$idx"

  # Extract the output file using the special delimiters
  OUTPUT_FILE=$(extract_output_file "$INFER_OUTPUT")
  echo "Got OUTPUT_FILE: $OUTPUT_FILE"
  if [[ -z "$OUTPUT_FILE" ]]; then
    echo "ERROR: no '### OUTPUT FILE: ... ###' marker found in inference output; aborting." >&2
    return 1
  fi

  run_evaluation_with_retry "$OUTPUT_FILE"

  # update the output with evaluation results
  echo "### Updating the output with evaluation results... ###"
  poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py "$OUTPUT_FILE"

  echo "### Combining the final completions... ###"
  poetry run python evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py "$OUTPUT_FILE"

  echo "### DONE for run $idx! ###"
  # NOTE(review): FINAL_OUTPUT_FILE is not assigned anywhere in the visible
  # script, so unless the environment provides it this prints only the
  # directory followed by "/". Confirm the intended file name.
  echo "You can find the final output at $(dirname "$OUTPUT_FILE")/${FINAL_OUTPUT_FILE:-}"
}

for ((run_idx = 1; run_idx <= N_RUNS; run_idx++)); do
  rollout_once "$run_idx" || exit 1
done