52 lines
1.4 KiB
Bash
52 lines
1.4 KiB
Bash
#!/bin/bash
#
# SLURM batch script template: boots a Ray cluster on the job's allocation and
# then runs scgenerator against it.
#
# NOTE(review): the single-brace placeholders and the doubled braces around the
# array subscripts below suggest this file is rendered by a str.format-style
# templating step before submission - confirm against the caller. Any literal
# brace added to this file would have to be doubled as well.
#
# Flow:
#   1. the first node of the allocation runs start_head.sh,
#   2. every remaining node runs start_worker.sh and receives the head address,
#   3. scgenerator runs in the foreground and its exit status ends the job.
#
# All sbatch directives must stay above the first executable command.

#SBATCH --time={time}
#SBATCH --job-name="{job_name}"

### This script works for any number of nodes; Ray will find and manage all resources.
#SBATCH --nodes={nodes}

### Give all resources of a node to a single Ray task; Ray manages them internally.
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={cpus_per_node}
{dependency}

# Load modules or your own conda environment here.
{environment_setup}

# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599

# One random password per job, exported and also handed to the start scripts.
# NOTE(review): it is passed as a command-line argument below, so it is visible
# in the process list of the compute nodes.
redis_password=$(uuidgen)
export redis_password

# Expand the compact SLURM node list into individual host names.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Word splitting is intentional here: one host name per array element.
nodes_array=( $nodes )

# The first node of the allocation hosts the Ray head.
node_1=${{nodes_array[0]}}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address)

# hostname --ip-address may print several space-separated addresses. Keep the
# first one that contains no colon so that "ip:port" stays unambiguous, and
# fall back to the first address printed if every candidate contains a colon.
if [[ "$ip" == *" "* ]]; then
  first_ip=""
  ipv4=""
  for candidate in $ip; do
    [[ -n "$first_ip" ]] || first_ip=$candidate
    if [[ "$candidate" != *:* ]]; then
      ipv4=$candidate
      break
    fi
  done
  ip=$ipv4
  [[ -n "$ip" ]] || ip=$first_ip
fi

# Record the head address in the home directory.
printf '%s\n' "$ip" > ~/ip_head

port=6379
ip_head="$ip:$port"
export ip_head
export RAY_ADDRESS="$ip_head"
echo "IP Head: $ip_head"

# Address of the node executing this batch script.
# NOTE(review): not referenced again in this file and not exported - confirm
# whether anything rendered into the placeholders relies on it.
ip_submit=$(hostname --ip-address)

echo "STARTING HEAD at $node_1"
srun --nodes=1 --ntasks=1 -w "$node_1" start_head.sh "$ip" "$redis_password" &
# Fixed pause after launching the head, before the first worker is started.
sleep 5

# Number of nodes other than the head node.
worker_num=$(( SLURM_JOB_NUM_NODES - 1 ))
for (( i = 1; i <= worker_num; i++ )); do
  node_i=${{nodes_array[$i]}}
  echo "STARTING WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 -w "$node_i" start_worker.sh "$ip_head" "$redis_password" &
  sleep 5
done

#### Call your code below.
scgenerator {command} {config}
# A bare exit returns the status of the last command, i.e. scgenerator.
exit