#!/bin/bash
# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"

# Job configuration
ACCOUNT="aip-craffel"
SCRIPT_NAME="gradio_job.slurm"
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
# Request more memory to run more models
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861
script_location="$APP_DIR/$SCRIPT_NAME"
ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"
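# Note: OUTPUT_DIR is assumed to already exist on the cluster; SLURM does not
# create it, and the job cannot write its %j.out file if it is missing.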

# Function to clean up temporary files
cleanup() {
    echo "Cleaning up..."
    if [ -f "$SCRIPT_NAME" ]; then
        rm "$SCRIPT_NAME"
    fi
}

# Clean up on exit; on interrupt, exit explicitly so the EXIT trap runs
# without clobbering the script's own exit status
trap cleanup EXIT
trap 'exit 130' INT TERM

# Generate SLURM job script locally
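# In the heredoc below, unquoted $VARS are expanded now (locally), while
# \$-escaped variables and \$(...) are left for SLURM to evaluate at job runtime.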
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out
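# No --error is given, so SLURM writes stderr to the same %j.out file.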

# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"

source /home/$CLUSTER_USER/.bashrc

# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13

# Activate virtual environment
source "${ENV_PATH}"
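
# Authenticate with Hugging Face. This assumes HF_TOKEN is exported in the
# remote ~/.bashrc sourced above; it is not set by this script.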
echo \$HF_TOKEN
hf auth login --token \$HF_TOKEN
hf auth whoami

# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT

# Start Gradio app
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
# python "${APP_PATH}" --watch-dirs "${APP_DIR}"
# The job runs until the Gradio process exits
echo "Gradio app finished at: \$(date)"
EOF
| echo "Generated SLURM job script: $SCRIPT_NAME" | |
| # Transfer the job script to the cluster and submit it | |
| scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location" | |
| if [ $? -ne 0 ]; then | |
| echo "Error: Failed to transfer job script to cluster" | |
| exit 1 | |
| fi | |
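
# sbatch --parsable prints just the job ID; ssh -t can add a trailing carriage
# return, so tr strips any CR/LF before the ID is used below.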
| echo "Submitting job to cluster..." | |
| JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \ | |
| "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \ | |
| | tr -d '\r\n') | |
| if [ $? -ne 0 ]; then | |
| echo "Error: Failed to submit job to cluster" | |
| exit 1 | |
| fi | |
| echo "Job submitted with ID: $JOB_ID" | |

# Monitor job status from local machine
echo "Monitoring job status from local machine..."
while true; do
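    # squeue -h suppresses the header; -o "%T" prints only the job state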
    JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
    echo "Job status: $JOB_STATUS"
    if [ -z "$JOB_STATUS" ]; then
        echo "Error: Job $JOB_ID not found. It may have failed to start."
        echo "Checking job output..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "ls -la ${OUTPUT_DIR}/${JOB_ID}.* 2>/dev/null && echo 'Output files:' && cat ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null"
        exit 1
    elif [ "$JOB_STATUS" = "RUNNING" ]; then
        echo "Job is now running!"
        break
    elif [ "$JOB_STATUS" = "PENDING" ]; then
        echo "Job is pending... (waiting for resources)"
        sleep 5
    else
        if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
            echo "Job ended with status: $JOB_STATUS"
            echo "Checking job output files..."
            ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null"
            exit 1
        fi
        sleep 5
    fi
done

# Get the allocated node
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job (${JOB_ID}) is running on node: ${NODE}"

# Wait a moment for the Gradio app to start
echo "Waiting for Gradio app to initialize..."
sleep 10

# Check if Gradio is actually running
echo "Checking if Gradio app started successfully..."
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")

# Handle process check
if [ -n "$GRADIO_CHECK" ]; then
    echo "✓ Gradio app appears to be running"
else
    echo "⚠ Warning: Gradio app may not have started properly"
    echo "Check the job output:"
    ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
fi

cancel_job() {
    read -p "Would you like to cancel the job? (y/n): " -n 1 -r
    echo ""
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID}'"
    else
        echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
    fi
}

# Optional port forwarding
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
    # If GRADIO_PORT is in use locally, pick a random free port
    if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
        echo "Port $GRADIO_PORT is already in use locally — selecting a free one..."
        LOCAL_PORT=$(comm -23 \
            <(seq 1024 65535 | sort) \
            <(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \
            | awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}')
    else
        LOCAL_PORT="$GRADIO_PORT"
    fi
| echo "Using local port: $LOCAL_PORT" | |
| echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app." | |
| ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \ | |
| -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash" | |
| echo "" | |
| echo "Port forwarding ended." | |
| cancel_job | |
else
    echo "Skipping port forwarding."
    # Connection info
    cat <<EOF
=========================================
Gradio app should be running on:
  Cluster: $CLUSTER_HOST
  Node: $NODE
  Port: $GRADIO_PORT

To access from your local machine:
  ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
  Then open: http://localhost:$GRADIO_PORT

Alternative direct SSH with forwarding:
  ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST

Check job status:
  ssh $CLUSTER_USER@$CLUSTER_HOST "squeue -j $JOB_ID"

Cancel job:
  ssh $CLUSTER_USER@$CLUSTER_HOST "scancel $JOB_ID"
=========================================
EOF
    echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi