Commit

Merge pull request #29 from xdslproject/emilien/archer2-slurm
Get those slurm batch files out of Archer2.
PapyChacal authored Oct 13, 2023
2 parents 8e8f127 + b6b0419 commit aacb7fd
Showing 19 changed files with 1,109 additions and 3 deletions.
3 changes: 0 additions & 3 deletions devito/mpi/distributed.py
@@ -196,9 +196,6 @@ def __init__(self, shape, dimensions, input_comm=None, topology=None):
         # mpi4py takes care of that when the object gets out of scope
         self._input_comm = (input_comm or MPI.COMM_WORLD).Clone()
 
-        if len(shape) == 3:
-            topology = ('*', '*', 1)
-
         if topology is None:
             # `MPI.Compute_dims` sets the dimension sizes to be as close to each other
             # as possible, using an appropriate divisibility algorithm. Thus, in 3D:
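
With the special case gone, 3D grids are no longer forced onto a flat ('*', '*', 1) decomposition, and `MPI.Compute_dims` again balances the ranks across all three axes, as the surrounding comment describes. A quick login-node check of what it picks, as a sketch assuming mpi4py is importable in the environment:

python3 - <<'EOF'
from mpi4py import MPI
# Compute_dims spreads the factors of the rank count as evenly as
# possible across the requested dimensions; with MPICH-based stacks this
# typically gives 8 -> [2, 2, 2], 64 -> [4, 4, 4], 1024 -> [16, 8, 8].
for nprocs in (8, 64, 1024):
    print(nprocs, MPI.Compute_dims(nprocs, 3))
EOF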
61 changes: 61 additions & 0 deletions fast/slurm-jobs/diffusion-1.slurm
@@ -0,0 +1,61 @@
#!/bin/bash

# Slurm job options (job-name, compute nodes, job time)
#SBATCH --job-name=Devito_MPI_Job
#SBATCH --time=00:15:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
#SBATCH --switches=1@360 # Each group has 128 nodes

# Budget code for the job; replace d011 with your own project code (e.g. t01)
#SBATCH --account=d011
#SBATCH --partition=standard
#SBATCH --qos=standard
#SBATCH -o ./jobs-output/diffusion-1.%j.out # STDOUT

# Propagate the cpus-per-task setting from script to srun commands
# By default, Slurm does not propagate this setting from the sbatch
# options to srun commands in the job script. If this is not done,
# process/thread pinning may be incorrect, leading to poor performance.
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

export SHARED=/work/d011/d011/shared
module use $SHARED/modules
module load sc-23
module load cray-mpich


cd $SHARED/software/devito/fast

# Set the number of threads to 16 and specify placement
# There are 16 OpenMP threads per MPI process
# We want one thread per physical core
export OMP_NUM_THREADS=16
export OMP_PLACES=cores

# Devito-specific env variables
export DEVITO_ARCH=cray
export DEVITO_LANGUAGE=openmp
export DEVITO_LOGGING=BENCH
export DEVITO_MPI=1
export DEVITO_AUTOTUNING=aggressive
# export DEVITO_PROFILING=advanced2

# Archer specific
# export MPICH_OFI_STARTUP_CONNECT=1
# export MPICH_OFI_RMA_STARTUP_CONNECT=1
export FI_OFI_RXM_SAR_LIMIT=524288
export FI_OFI_RXM_BUFFER_SIZE=131072
export MPICH_SMP_SINGLE_COPY_SIZE=16384
export CRAY_OMP_CHECK_AFFINITY=TRUE
export SLURM_CPU_FREQ_REQ=2250000

# Launch the parallel job
# Using nodes x ntasks-per-node MPI processes
# 8 MPI processes per node
# 16 OpenMP threads per MPI process
# Additional srun options to pin one thread per physical core

srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1
srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
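
Not part of the diff, but for context: a script like the one above would be submitted from the repository root with sbatch. Note that the jobs-output/ directory must already exist, since Slurm does not create directories named in -o:

sbatch fast/slurm-jobs/diffusion-1.slurm   # prints "Submitted batch job <jobid>"
squeue -u $USER                            # watch it queue and run
cat jobs-output/diffusion-1.<jobid>.out    # STDOUT, per the -o directive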
62 changes: 62 additions & 0 deletions fast/slurm-jobs/diffusion-128.slurm
@@ -0,0 +1,62 @@
#!/bin/bash

# Slurm job options (job-name, compute nodes, job time)
#SBATCH --job-name=Devito_MPI_Job
#SBATCH --time=00:15:00
#SBATCH --nodes=128
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
#SBATCH --switches=1@360 # Each group has 128 nodes

# Budget code for the job; replace d011 with your own project code (e.g. t01)
#SBATCH --account=d011
#SBATCH --partition=standard
#SBATCH --qos=standard
#SBATCH -o ./jobs-output/diffusion-128.%j.out # STDOUT

# Propagate the cpus-per-task setting from script to srun commands
# By default, Slurm does not propagate this setting from the sbatch
# options to srun commands in the job script. If this is not done,
# process/thread pinning may be incorrect, leading to poor performance.
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

export SHARED=/work/d011/d011/shared
module use $SHARED/modules
module load sc-23
module load cray-mpich


cd $SHARED/software/devito/fast

# Set the number of threads to 16 and specify placement
# There are 16 OpenMP threads per MPI process
# We want one thread per physical core
export OMP_NUM_THREADS=16
export OMP_PLACES=cores

# Devito-specific env variables
export DEVITO_ARCH=cray
export DEVITO_LANGUAGE=openmp
export DEVITO_LOGGING=BENCH
export DEVITO_MPI=1
export DEVITO_AUTOTUNING=aggressive

# export DEVITO_PROFILING=advanced2

# Archer specific
# export MPICH_OFI_STARTUP_CONNECT=1
# export MPICH_OFI_RMA_STARTUP_CONNECT=1
export FI_OFI_RXM_SAR_LIMIT=524288
export FI_OFI_RXM_BUFFER_SIZE=131072
export MPICH_SMP_SINGLE_COPY_SIZE=16384
export CRAY_OMP_CHECK_AFFINITY=TRUE
export SLURM_CPU_FREQ_REQ=2250000

# Launch the parallel job
# Using nodes x ntasks-per-node MPI processes
# 8 MPI processes per node
# 16 OpenMP threads per MPI process
# Additional srun options to pin one thread per physical core

DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1
srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
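
One difference from the single-node script: here DEVITO_MPI=diag2 is prefixed to the first srun, which overrides the exported DEVITO_MPI=1 for that one command, so the Devito run uses the diag2 halo-exchange scheme while the xDSL run keeps the plain MPI setting. A minimal demonstration of the shell semantics, with illustrative names:

export GREETING=hello
GREETING=world bash -c 'echo $GREETING'   # prints "world": the prefix wins for this command only
bash -c 'echo $GREETING'                  # prints "hello": the export is unchanged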
62 changes: 62 additions & 0 deletions fast/slurm-jobs/diffusion-16.slurm
@@ -0,0 +1,62 @@
#!/bin/bash

# Slurm job options (job-name, compute nodes, job time)
#SBATCH --job-name=Devito_MPI_Job
#SBATCH --time=00:10:00
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
#SBATCH --switches=1@360 # Each group has 128 nodes

# Budget code for the job; replace d011 with your own project code (e.g. t01)
#SBATCH --account=d011
#SBATCH --partition=standard
#SBATCH --qos=standard
#SBATCH -o ./jobs-output/diffusion-16.%j.out # STDOUT

# Propagate the cpus-per-task setting from script to srun commands
# By default, Slurm does not propagate this setting from the sbatch
# options to srun commands in the job script. If this is not done,
# process/thread pinning may be incorrect, leading to poor performance.
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

export SHARED=/work/d011/d011/shared
module use $SHARED/modules
module load sc-23
module load cray-mpich


cd $SHARED/software/devito/fast

# Set the number of threads to 16 and specify placement
# There are 16 OpenMP threads per MPI process
# We want one thread per physical core
export OMP_NUM_THREADS=16
export OMP_PLACES=cores

# Devito-specific env variables
export DEVITO_ARCH=cray
export DEVITO_LANGUAGE=openmp
export DEVITO_LOGGING=BENCH
export DEVITO_MPI=1
export DEVITO_AUTOTUNING=aggressive

# export DEVITO_PROFILING=advanced2

# Archer specific
# export MPICH_OFI_STARTUP_CONNECT=1
# export MPICH_OFI_RMA_STARTUP_CONNECT=1
export FI_OFI_RXM_SAR_LIMIT=524288
export FI_OFI_RXM_BUFFER_SIZE=131072
export MPICH_SMP_SINGLE_COPY_SIZE=16384
export CRAY_OMP_CHECK_AFFINITY=TRUE
export SLURM_CPU_FREQ_REQ=2250000

# Launch the parallel job
# Using nodes x ntasks-per-node MPI processes
# 8 MPI processes per node
# 16 OpenMP threads per MPI process
# Additional srun options to pin one thread per physical core

DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1
srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
62 changes: 62 additions & 0 deletions fast/slurm-jobs/diffusion-2.slurm
@@ -0,0 +1,62 @@
#!/bin/bash

# Slurm job options (job-name, compute nodes, job time)
#SBATCH --job-name=Devito_MPI_Job
#SBATCH --time=00:10:00
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
#SBATCH --switches=1@360 # Each group has 128 nodes

# Budget code for the job; replace d011 with your own project code (e.g. t01)
#SBATCH --account=d011
#SBATCH --partition=standard
#SBATCH --qos=standard
#SBATCH -o ./jobs-output/diffusion-2.%j.out # STDOUT

# Propagate the cpus-per-task setting from script to srun commands
# By default, Slurm does not propagate this setting from the sbatch
# options to srun commands in the job script. If this is not done,
# process/thread pinning may be incorrect, leading to poor performance.
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

export SHARED=/work/d011/d011/shared
module use $SHARED/modules
module load sc-23
module load cray-mpich


cd $SHARED/software/devito/fast

# Set the number of threads to 16 and specify placement
# There are 16 OpenMP threads per MPI process
# We want one thread per physical core
export OMP_NUM_THREADS=16
export OMP_PLACES=cores

# Devito-specific env variables
export DEVITO_ARCH=cray
export DEVITO_LANGUAGE=openmp
export DEVITO_LOGGING=BENCH
export DEVITO_MPI=1
export DEVITO_AUTOTUNING=aggressive

# export DEVITO_PROFILING=advanced2

# Archer specific
# export MPICH_OFI_STARTUP_CONNECT=1
# export MPICH_OFI_RMA_STARTUP_CONNECT=1
export FI_OFI_RXM_SAR_LIMIT=524288
export FI_OFI_RXM_BUFFER_SIZE=131072
export MPICH_SMP_SINGLE_COPY_SIZE=16384
export CRAY_OMP_CHECK_AFFINITY=TRUE
export SLURM_CPU_FREQ_REQ=2250000

# Launch the parallel job
# Using nodes x ntasks-per-node MPI processes
# 8 MPI processes per node
# 16 OpenMP threads per MPI process
# Additional srun options to pin one thread per physical core

DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1
srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
62 changes: 62 additions & 0 deletions fast/slurm-jobs/diffusion-32.slurm
@@ -0,0 +1,62 @@
#!/bin/bash

# Slurm job options (job-name, compute nodes, job time)
#SBATCH --job-name=Devito_MPI_Job
#SBATCH --time=00:10:00
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
#SBATCH --switches=1@360 # Each group has 128 nodes

# Budget code for the job; replace d011 with your own project code (e.g. t01)
#SBATCH --account=d011
#SBATCH --partition=standard
#SBATCH --qos=standard
#SBATCH -o ./jobs-output/diffusion-32.%j.out # STDOUT

# Propagate the cpus-per-task setting from script to srun commands
# By default, Slurm does not propagate this setting from the sbatch
# options to srun commands in the job script. If this is not done,
# process/thread pinning may be incorrect, leading to poor performance.
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

export SHARED=/work/d011/d011/shared
module use $SHARED/modules
module load sc-23
module load cray-mpich


cd $SHARED/software/devito/fast

# Set the number of threads to 16 and specify placement
# There are 16 OpenMP threads per MPI process
# We want one thread per physical core
export OMP_NUM_THREADS=16
export OMP_PLACES=cores

# Devito-specific env variables
export DEVITO_ARCH=cray
export DEVITO_LANGUAGE=openmp
export DEVITO_LOGGING=BENCH
export DEVITO_MPI=1
export DEVITO_AUTOTUNING=aggressive

# export DEVITO_PROFILING=advanced2

# Archer specific
# export MPICH_OFI_STARTUP_CONNECT=1
# export MPICH_OFI_RMA_STARTUP_CONNECT=1
export FI_OFI_RXM_SAR_LIMIT=524288
export FI_OFI_RXM_BUFFER_SIZE=131072
export MPICH_SMP_SINGLE_COPY_SIZE=16384
export CRAY_OMP_CHECK_AFFINITY=TRUE
export SLURM_CPU_FREQ_REQ=2250000

# Launch the parallel job
# Using nodes x ntasks-per-node MPI processes
# 8 MPI processes per node
# 16 OpenMP threads per MPI process
# Additional srun options to pin one thread per physical core

DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1
srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
61 changes: 61 additions & 0 deletions fast/slurm-jobs/diffusion-4.slurm
@@ -0,0 +1,61 @@
#!/bin/bash

# Slurm job options (job-name, compute nodes, job time)
#SBATCH --job-name=Devito_MPI_Job
#SBATCH --time=00:10:00
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
#SBATCH --switches=1@360 # Each group has 128 nodes

# Budget code for the job; replace d011 with your own project code (e.g. t01)
#SBATCH --account=d011
#SBATCH --partition=standard
#SBATCH --qos=standard
#SBATCH -o ./jobs-output/diffusion-4.%j.out # STDOUT

# Propagate the cpus-per-task setting from script to srun commands
# By default, Slurm does not propagate this setting from the sbatch
# options to srun commands in the job script. If this is not done,
# process/thread pinning may be incorrect, leading to poor performance.
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

export SHARED=/work/d011/d011/shared
module use $SHARED/modules
module load sc-23
module load cray-mpich


cd $SHARED/software/devito/fast

# Set the number of threads to 16 and specify placement
# There are 16 OpenMP threads per MPI process
# We want one thread per physical core
export OMP_NUM_THREADS=16
export OMP_PLACES=cores

# Devito-specific env variables
export DEVITO_ARCH=cray
export DEVITO_LANGUAGE=openmp
export DEVITO_LOGGING=BENCH
export DEVITO_MPI=1
export DEVITO_AUTOTUNING=aggressive
# export DEVITO_PROFILING=advanced2

# Archer specific
# export MPICH_OFI_STARTUP_CONNECT=1
# export MPICH_OFI_RMA_STARTUP_CONNECT=1
export FI_OFI_RXM_SAR_LIMIT=524288
export FI_OFI_RXM_BUFFER_SIZE=131072
export MPICH_SMP_SINGLE_COPY_SIZE=16384
export CRAY_OMP_CHECK_AFFINITY=TRUE
export SLURM_CPU_FREQ_REQ=2250000

# Launch the parallel job
# Using nodes x ntasks-per-node MPI processes
# 8 MPI processes per node
# 16 OpenMP threads per MPI process
# Additional srun options to pin one thread per physical core

DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1
srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
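
The scripts follow the same template, varying only the node count, walltime, and output file name. Not part of the commit, just a convenience sketch for submitting the strong-scaling sweep over the six node counts shown here:

for n in 1 2 4 16 32 128; do
    sbatch fast/slurm-jobs/diffusion-$n.slurm
done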