cleanup data folder
This commit is contained in:
9
slurm/start_head.sh
Normal file
9
slurm/start_head.sh
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "starting ray head node"
|
||||
|
||||
# Launch the head node
|
||||
ray start --head --node-ip-address=$1 --port=6379 --redis-password=$2 --num-cpus=$SLURM_CPUS_PER_TASK --dashboard-host=127.0.0.1
|
||||
|
||||
|
||||
sleep infinity
|
||||
5
slurm/start_worker.sh
Normal file
5
slurm/start_worker.sh
Normal file
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
head /proc/cpuinfo
|
||||
echo "starting ray worker node"
|
||||
ray start --address $1 --redis-password=$2 --num-cpus=$SLURM_CPUS_PER_TASK
|
||||
sleep infinity
|
||||
52
slurm/submit_job_template.txt
Normal file
52
slurm/submit_job_template.txt
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/bin/bash
|
||||
|
||||
#SBATCH --time={time}
|
||||
#SBATCH --job-name="{job_name}"
|
||||
|
||||
### This script works for any number of nodes, Ray will find and manage all resources
|
||||
#SBATCH --nodes={nodes}
|
||||
|
||||
### Give all resources to a single Ray task, ray can manage the resources internally
|
||||
#SBATCH --ntasks-per-node=1
|
||||
#SBATCH --cpus-per-task={cpus_per_node}
|
||||
{dependency}
|
||||
|
||||
# Load modules or your own conda environment here
|
||||
{environment_setup}
|
||||
|
||||
# This script is a modification to the implementation suggest by gregSchwartz18 here:
|
||||
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
|
||||
redis_password=$(uuidgen)
|
||||
export redis_password
|
||||
|
||||
nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
|
||||
nodes_array=( $nodes )
|
||||
|
||||
node_1=${{nodes_array[0]}}
|
||||
ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address
|
||||
echo $ip > ~/ip_head
|
||||
port=6379
|
||||
ip_head=$ip:$port
|
||||
export ip_head
|
||||
export RAY_ADDRESS=$ip_head
|
||||
echo "IP Head: $ip_head"
|
||||
|
||||
ip_submit=$(hostname --ip-address)
|
||||
|
||||
echo "STARTING HEAD at $node_1"
|
||||
srun --nodes=1 --ntasks=1 -w $node_1 start_head.sh $ip $redis_password &
|
||||
sleep 5
|
||||
|
||||
|
||||
worker_num=$(($SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
|
||||
for (( i=1; i<=$worker_num; i++ ))
|
||||
do
|
||||
node_i=${{nodes_array[$i]}}
|
||||
echo "STARTING WORKER $i at $node_i"
|
||||
srun --nodes=1 --ntasks=1 -w $node_i start_worker.sh $ip_head $redis_password &
|
||||
sleep 5
|
||||
done
|
||||
|
||||
#### call your code below
|
||||
scgenerator {command} {config}
|
||||
exit
|
||||
Reference in New Issue
Block a user