#!/bin/bash
# SLURM batch template: starts a Ray head process on the first allocated node,
# one Ray worker on every other node, then runs scgenerator.
#
# NOTE(review): this file appears to be a string template filled in before
# submission (curly-brace fields are substituted). Literal shell braces must
# therefore stay doubled, as in the array expansions below.

#SBATCH --time={time}
#SBATCH --job-name="{job_name}"

### This script works for any number of nodes, Ray will find and manage all resources
#SBATCH --nodes={nodes}

### Give all resources to a single Ray task, ray can manage the resources internally
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={cpus_per_node}
# NOTE(review): the next field is assumed to render either to nothing or to a
# complete SBATCH directive line of its own - confirm against the caller.
{dependency}

# Load modules or your own conda environment here
{environment_setup}

# This script is a modification to the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599

# Fresh random password for this job, exported so child processes can read it.
redis_password=$(uuidgen)
export redis_password

# Expand the compact SLURM node list into one hostname per array element.
mapfile -t nodes_array < <(scontrol show hostnames "$SLURM_JOB_NODELIST")

# The first node of the allocation hosts the head process.
node_1="${{nodes_array[0]}}"

# Ask the head node itself for its IP address; it is the base of the redis address.
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address)
printf '%s\n' "$ip" > ~/ip_head
port=6379
ip_head="$ip:$port"
export ip_head
# Presumably picked up by Ray clients started later in this job - TODO confirm.
export RAY_ADDRESS="$ip_head"
echo "IP Head: $ip_head"

# IP address of the node running this batch script. Not referenced again in
# this template; kept in case the injected setup or command relies on it.
ip_submit=$(hostname --ip-address)

# Launch the head in the background, then give it 5 seconds before continuing.
echo "STARTING HEAD at $node_1"
srun --nodes=1 --ntasks=1 -w "$node_1" start_head.sh "$ip" "$redis_password" &
sleep 5

# Number of nodes other than the head node.
worker_num=$(( SLURM_JOB_NUM_NODES - 1 ))

# Launch one background worker per remaining node, 5 seconds apart.
for (( i = 1; i <= worker_num; i++ )); do
  node_i="${{nodes_array[i]}}"
  echo "STARTING WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 -w "$node_i" start_worker.sh "$ip_head" "$redis_password" &
  sleep 5
done

#### call your code below
scgenerator {command} {config}

# Exit with the status of scgenerator; the background srun steps end with the job.
exit