Spaces:
Running
Running
| CONFIG=$1 | |
| GPUS=$2 | |
| MODEL_NAME=$(basename "$(dirname $CONFIG)") | |
| PORT=10902 | |
| while : # auto-resume: the code sometimes crash due to bug of gloo on some gpus | |
| do | |
| torchrun --nproc_per_node=$GPUS \ | |
| --master_port=$PORT \ | |
| train.py --c $CONFIG --model $MODEL_NAME | |
| for PID in $(ps -aux | grep $CONFIG | grep python | awk '{print $2}') | |
| do | |
| echo $PID | |
| kill -9 $PID | |
| done | |
| sleep 30 | |
| done |