twitter-algorithm-ml/projects/twhin/scripts/run_in_slurm.sh
2023-03-31 10:31:35 -05:00

36 lines
932 B
Bash
Executable file

#!/usr/bin/env bash
#
# Launch the TwHIN trainer (tml/projects/twhin/run.py) with torchrun from
# inside a Slurm job.
#
# Environment read:
#   SLURM_JOB_NODELIST  node list handed to tml/machines/list_ops.py
#   SLURM_JOB_ID        used as the rendezvous id (multi-node branch only)
#   USER                used to build the path to slurm.yaml
# Environment exported:
#   PYTHONPATH          parent directory of the git top-level
#   TASK_TYPE           always "chief"
set -e

# Work from the parent of the git top-level so the relative `tml/...` paths
# below resolve (presumably the checkout directory is named `tml` -- confirm).
cd "$(git rev-parse --show-toplevel)" || exit
cd ..

# Assign first, export second, so a failing `pwd` is not masked by `export`.
PYTHONPATH="$(pwd)"
export PYTHONPATH
export TASK_TYPE="chief"

CONFIG_YAML="/home/$USER/workspace/tml/projects/twhin/config/slurm.yaml"

# list_ops.py is given the node list with ',' as separator; `select --elem=0`
# presumably yields the first node and `len` the node count -- verify against
# tml/machines/list_ops.py.
ENDPOINT=$(python tml/machines/list_ops.py --op=select --input_list="$SLURM_JOB_NODELIST" --elem=0 --sep=',')
NNODES=$(python tml/machines/list_ops.py --op=len --input_list="$SLURM_JOB_NODELIST" --sep=',')
echo "NNODES: $NNODES"

if [[ $NNODES == "1" ]]; then
  echo "Single Trainer."
  # NOTE: the last argument must NOT end with a backslash; a trailing
  # continuation here would swallow the `else` keyword as a torchrun argument
  # and remove the else branch entirely.
  torchrun \
    --standalone \
    --nnodes 1 \
    --nproc_per_node 2 \
    tml/projects/twhin/run.py \
    --config_yaml_path="$CONFIG_YAML"
else
  echo "Multiple Trainers: ${NNODES} nodes."
  # Rendezvous over c10d at $ENDPOINT, keyed by the Slurm job id.
  torchrun \
    --rdzv_backend c10d \
    --rdzv_endpoint "$ENDPOINT" \
    --rdzv_id "$SLURM_JOB_ID" \
    --nnodes "$NNODES" \
    --nproc_per_node 2 \
    --role "" \
    tml/projects/twhin/run.py \
    --config_yaml_path="$CONFIG_YAML"
fi