diff --git a/fast/- b/fast/-
new file mode 100755
index 00000000000..a34fa05f31c
Binary files /dev/null and b/fast/- differ
diff --git a/fast/async.so b/fast/async.so
new file mode 100755
index 00000000000..1d8594a3c66
Binary files /dev/null and b/fast/async.so differ
diff --git a/fast/logging.patch b/fast/logging.patch
new file mode 100644
index 00000000000..72252a13d76
--- /dev/null
+++ b/fast/logging.patch
@@ -0,0 +1,55 @@
+diff --git a/devito/logger.py b/devito/logger.py
+index 2a7eed95a..b104efd34 100644
+--- a/devito/logger.py
++++ b/devito/logger.py
+@@ -16,12 +16,15 @@ logger.addHandler(stream_handler)
+ # Add extra logging levels (note: INFO has value=20, WARNING has value=30)
+ DEBUG = logging.DEBUG
+ PERF = 19
++BENCH = logging.DEBUG
+ INFO = logging.INFO
+ WARNING = logging.WARNING
+ ERROR = logging.ERROR
+ CRITICAL = logging.CRITICAL
+
+ logging.addLevelName(PERF, "PERF")
++logging.addLevelName(BENCH, "BENCH")
++
+
+ logger_registry = {
+     'DEBUG': DEBUG,
+@@ -29,7 +32,8 @@ logger_registry = {
+     'INFO': INFO,
+     'WARNING': WARNING,
+     'ERROR': ERROR,
+-    'CRITICAL': CRITICAL
++    'CRITICAL': CRITICAL,
++    'BENCH': BENCH
+ }
+
+ NOCOLOR = '%s'
+@@ -133,6 +137,8 @@ def warning(msg, *args, **kwargs):
+ def error(msg, *args, **kwargs):
+     log(msg, ERROR, *args, **kwargs)
+
++def bench(msg, *args, **kwargs):
++    log(msg, BENCH, *args, **kwargs)
+
+ def debug(msg, *args, **kwargs):
+     log(msg, DEBUG, *args, **kwargs)
+diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py
+index f7633a740..2d576df5e 100644
+--- a/devito/operator/profiling.py
++++ b/devito/operator/profiling.py
+@@ -460,7 +460,7 @@ class PerformanceSummary(OrderedDict):
+
+ def create_profile(name):
+     """Create a new Profiler."""
+-    if configuration['log-level'] in ['DEBUG', 'PERF'] and \
++    if configuration['log-level'] in ['DEBUG', 'PERF', 'BENCH'] and \
+             configuration['profiling'] == 'basic':
+         # Enforce performance profiling in DEBUG mode
+         level = 'advanced'
* Unmerged path fast/slurm/diffusion-16.slurm
* Unmerged path fast/slurm/diffusion-2.slurm
* Unmerged path fast/slurm/diffusion-32.slurm
diff --git a/fast/slurm-jobs/diffusion-1.slurm b/fast/slurm-jobs/diffusion-1.slurm
index 0d38fdbd150..ee5dc005bee 100644
--- a/fast/slurm-jobs/diffusion-1.slurm
+++ b/fast/slurm-jobs/diffusion-1.slurm
@@ -37,7 +37,7 @@ export OMP_PLACES=cores
 # Devito-specific env variables
 export DEVITO_ARCH=cray
 export DEVITO_LANGUAGE=openmp
-export DEVITO_LOGGING=DEBUG
+export DEVITO_LOGGING=BENCH
 export DEVITO_MPI=1
 export DEVITO_AUTOTUNING=aggressive
 # export DEVITO_PROFILING=advanced2
diff --git a/fast/slurm-jobs/diffusion-128.slurm b/fast/slurm-jobs/diffusion-128.slurm
index d78da834dd4..e6c1a8db90f 100644
--- a/fast/slurm-jobs/diffusion-128.slurm
+++ b/fast/slurm-jobs/diffusion-128.slurm
@@ -37,7 +37,7 @@ export OMP_PLACES=cores
 # Devito-specific env variables
 export DEVITO_ARCH=cray
 export DEVITO_LANGUAGE=openmp
-export DEVITO_LOGGING=DEBUG
+export DEVITO_LOGGING=BENCH
 export DEVITO_MPI=1
 export DEVITO_AUTOTUNING=aggressive
diff --git a/fast/slurm-jobs/diffusion-16.slurm b/fast/slurm-jobs/diffusion-16.slurm
index 540e30aea28..e39179fa6c3 100644
--- a/fast/slurm-jobs/diffusion-16.slurm
+++ b/fast/slurm-jobs/diffusion-16.slurm
@@ -37,7 +37,7 @@ export OMP_PLACES=cores
 # Devito-specific env variables
 export DEVITO_ARCH=cray
 export DEVITO_LANGUAGE=openmp
-export DEVITO_LOGGING=DEBUG
+export DEVITO_LOGGING=BENCH
 export DEVITO_MPI=1
 export DEVITO_AUTOTUNING=aggressive
diff --git a/fast/slurm-jobs/diffusion-2.slurm
b/fast/slurm-jobs/diffusion-2.slurm index 8d24017ff7b..cbcbe6517d3 100644 --- a/fast/slurm-jobs/diffusion-2.slurm +++ b/fast/slurm-jobs/diffusion-2.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/diffusion-32.slurm b/fast/slurm-jobs/diffusion-32.slurm index d24ec70256b..698248a4744 100644 --- a/fast/slurm-jobs/diffusion-32.slurm +++ b/fast/slurm-jobs/diffusion-32.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/diffusion-4.slurm b/fast/slurm-jobs/diffusion-4.slurm index 0aaa7184563..5f279c5dc21 100644 --- a/fast/slurm-jobs/diffusion-4.slurm +++ b/fast/slurm-jobs/diffusion-4.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm-jobs/diffusion-64.slurm b/fast/slurm-jobs/diffusion-64.slurm index 8ecea840b30..dfa9f259b9b 100644 --- a/fast/slurm-jobs/diffusion-64.slurm +++ b/fast/slurm-jobs/diffusion-64.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/diffusion-8.slurm b/fast/slurm-jobs/diffusion-8.slurm index 3369a0e7653..57a2385ca79 100644 --- a/fast/slurm-jobs/diffusion-8.slurm +++ b/fast/slurm-jobs/diffusion-8.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm-jobs/job-devito-xdsl-wave.slurm b/fast/slurm-jobs/job-devito-xdsl-wave.slurm index 98ac935e158..c622ecc6e29 100644 --- a/fast/slurm-jobs/job-devito-xdsl-wave.slurm +++ b/fast/slurm-jobs/job-devito-xdsl-wave.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm-jobs/job-devito-xdsl.slurm b/fast/slurm-jobs/job-devito-xdsl.slurm index fecbdc0f6a1..001bcff5805 100644 --- a/fast/slurm-jobs/job-devito-xdsl.slurm +++ b/fast/slurm-jobs/job-devito-xdsl.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm-jobs/wave-1.slurm b/fast/slurm-jobs/wave-1.slurm index f212a6f2e31..5e6d73da32d 100644 --- a/fast/slurm-jobs/wave-1.slurm +++ b/fast/slurm-jobs/wave-1.slurm @@ -39,7 +39,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH 
export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm-jobs/wave-128.slurm b/fast/slurm-jobs/wave-128.slurm index d2abb6d1546..553756b7a9f 100644 --- a/fast/slurm-jobs/wave-128.slurm +++ b/fast/slurm-jobs/wave-128.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=diag2 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/wave-16.slurm b/fast/slurm-jobs/wave-16.slurm index c6c547a135e..2e6eb9997b0 100644 --- a/fast/slurm-jobs/wave-16.slurm +++ b/fast/slurm-jobs/wave-16.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/wave-2.slurm b/fast/slurm-jobs/wave-2.slurm index 70fec5de362..a6f270dcf9b 100644 --- a/fast/slurm-jobs/wave-2.slurm +++ b/fast/slurm-jobs/wave-2.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/wave-32.slurm b/fast/slurm-jobs/wave-32.slurm index f234fffc317..0d93f7ab2b0 100644 --- a/fast/slurm-jobs/wave-32.slurm +++ b/fast/slurm-jobs/wave-32.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/wave-4.slurm b/fast/slurm-jobs/wave-4.slurm index ef7d5f0d6be..1f1f529902a 100644 --- a/fast/slurm-jobs/wave-4.slurm +++ b/fast/slurm-jobs/wave-4.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm-jobs/wave-64.slurm b/fast/slurm-jobs/wave-64.slurm index de9cc146c6e..06cdb863db5 100644 --- a/fast/slurm-jobs/wave-64.slurm +++ b/fast/slurm-jobs/wave-64.slurm @@ -36,7 +36,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive diff --git a/fast/slurm-jobs/wave-8.slurm b/fast/slurm-jobs/wave-8.slurm index 7b8e130d769..73454898d48 100644 --- a/fast/slurm-jobs/wave-8.slurm +++ b/fast/slurm-jobs/wave-8.slurm @@ -37,7 +37,7 @@ export OMP_PLACES=cores # Devito-specific env variables export DEVITO_ARCH=cray export DEVITO_LANGUAGE=openmp -export DEVITO_LOGGING=DEBUG +export DEVITO_LOGGING=BENCH export DEVITO_MPI=1 export DEVITO_AUTOTUNING=aggressive # export DEVITO_PROFILING=advanced2 diff --git a/fast/slurm.patch b/fast/slurm.patch new file mode 100644 index 00000000000..c34caee48da --- /dev/null +++ b/fast/slurm.patch @@ -0,0 +1,1217 @@ +diff --git a/fast/slurm/job-devito-xdsl-1.slurm b/fast/slurm/job-devito-xdsl-1.slurm +new file mode 100644 +index 000000000..2da2be2c9 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-1.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job 
options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:15:00 ++#SBATCH --nodes=1 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-1-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-128.slurm b/fast/slurm/job-devito-xdsl-128.slurm +new file mode 100644 +index 000000000..d71205364 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-128.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:15:00 ++#SBATCH --nodes=128 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-128.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-16.slurm b/fast/slurm/job-devito-xdsl-16.slurm +new file mode 100644 +index 000000000..624128d4d +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-16.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=16 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-16.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-2.slurm b/fast/slurm/job-devito-xdsl-2.slurm +new file mode 100644 +index 000000000..c9bbfbe62 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-2.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=2 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-2.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-32.slurm b/fast/slurm/job-devito-xdsl-32.slurm +new file mode 100644 +index 000000000..2a30c8d15 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-32.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=32 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-32-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-4.slurm b/fast/slurm/job-devito-xdsl-4.slurm +new file mode 100644 +index 000000000..dc6bd7157 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-4.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=4 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-4-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-64.slurm b/fast/slurm/job-devito-xdsl-64.slurm +new file mode 100644 +index 000000000..fa872a726 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-64.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:12:00 ++#SBATCH --nodes=64 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-64.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-8.slurm b/fast/slurm/job-devito-xdsl-8.slurm +new file mode 100644 +index 000000000..ea435be7e +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-8.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=8 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-8.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 diffusion_3D_wBCs.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/job-devito-xdsl-wave.slurm b/fast/slurm/job-devito-xdsl-wave.slurm +new file mode 100644 +index 000000000..98ac935e1 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl-wave.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=1 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-1-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++# srun --distribution=block:block --hint=nomultithread python3 run_benchmark.py 2d5pt -nt 100 --xdsl --devito --openmp --mpi -d 2000 2000 --repeat 1 ++srun --distribution=block:block --hint=nomultithread python3 nd_nwave_devito_nodamp.py -d 300 300 300 --nt 100 ++ +diff --git a/fast/slurm/job-devito-xdsl.slurm b/fast/slurm/job-devito-xdsl.slurm +new file mode 100644 +index 000000000..fecbdc0f6 +--- /dev/null ++++ b/fast/slurm/job-devito-xdsl.slurm +@@ -0,0 +1,59 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=1 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-1-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++srun --distribution=block:block --hint=nomultithread python3 run_benchmark.py 2d5pt -nt 100 --xdsl --devito --openmp --mpi -d 2000 2000 --repeat 1 +diff --git a/fast/slurm/wave-job-nn1.slurm b/fast/slurm/wave-job-nn1.slurm +new file mode 100644 +index 000000000..09345ef9b +--- /dev/null ++++ b/fast/slurm/wave-job-nn1.slurm +@@ -0,0 +1,65 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:20:00 ++#SBATCH --nodes=1 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-1-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export TMPDIR=/work/d011/d011/shared/temp/ ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++ ++# DEVITO_MPI=0 python setup_wave3d.py -d 1024 1024 1024 --nt 512 -so 4 ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn128.slurm b/fast/slurm/wave-job-nn128.slurm +new file mode 100644 +index 000000000..2fd2e19d2 +--- /dev/null ++++ b/fast/slurm/wave-job-nn128.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=128 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-128.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=diag2 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn16.slurm b/fast/slurm/wave-job-nn16.slurm +new file mode 100644 +index 000000000..f5eb65bef +--- /dev/null ++++ b/fast/slurm/wave-job-nn16.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=16 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-16.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn2.slurm b/fast/slurm/wave-job-nn2.slurm +new file mode 100644 +index 000000000..69f5345ad +--- /dev/null ++++ b/fast/slurm/wave-job-nn2.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:20:00 ++#SBATCH --nodes=2 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-2.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn32.slurm b/fast/slurm/wave-job-nn32.slurm +new file mode 100644 +index 000000000..302f4b160 +--- /dev/null ++++ b/fast/slurm/wave-job-nn32.slurm +@@ -0,0 +1,62 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:15:00 ++#SBATCH --nodes=32 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-32.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn4.slurm b/fast/slurm/wave-job-nn4.slurm +new file mode 100644 +index 000000000..6a3d2b8e5 +--- /dev/null ++++ b/fast/slurm/wave-job-nn4.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=4 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-4-full.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn64.slurm b/fast/slurm/wave-job-nn64.slurm +new file mode 100644 +index 000000000..27a14f9dd +--- /dev/null ++++ b/fast/slurm/wave-job-nn64.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:10:00 ++#SBATCH --nodes=64 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-64.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++ ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1 +diff --git a/fast/slurm/wave-job-nn8.slurm b/fast/slurm/wave-job-nn8.slurm +new file mode 100644 +index 000000000..69008296b +--- /dev/null ++++ b/fast/slurm/wave-job-nn8.slurm +@@ -0,0 +1,61 @@ ++#!/bin/bash ++ ++# Slurm job options (job-name, compute nodes, job time) ++#SBATCH --job-name=Devito_MPI_Job ++#SBATCH --time=00:08:00 ++#SBATCH --nodes=8 ++#SBATCH --ntasks-per-node=8 ++#SBATCH --cpus-per-task=16 ++#SBATCH --switches=1@360 # Each group has 128 nodes ++ ++# Replace [budget code] below with your project code (e.g. t01) ++#SBATCH --account=d011 ++#SBATCH --partition=standard ++#SBATCH --qos=standard ++#SBATCH -o ./jobs-output/output-8.%j.out # STDOUT ++ ++# Propagate the cpus-per-task setting from script to srun commands ++# By default, Slurm does not propagate this setting from the sbatch ++# options to srun commands in the job script. 
If this is not done, ++# process/thread pinning may be incorrect leading to poor performance ++export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK ++ ++export SHARED=/work/d011/d011/shared ++module use $SHARED/modules ++module load sc-23 ++module load cray-mpich ++ ++ ++cd $SHARED/software/devito/fast ++ ++# Set the number of threads to 16 and specify placement ++# There are 16 OpenMP threads per MPI process ++# We want one thread per physical core ++export OMP_NUM_THREADS=16 ++export OMP_PLACES=cores ++ ++# Devito-specific env variables ++export DEVITO_ARCH=cray ++export DEVITO_LANGUAGE=openmp ++export DEVITO_LOGGING=DEBUG ++export DEVITO_MPI=1 ++export DEVITO_AUTOTUNING=aggressive ++# export DEVITO_PROFILING=advanced2 ++ ++# Archer specific ++# export MPICH_OFI_STARTUP_CONNECT=1 ++# export MPICH_OFI_RMA_STARTUP_CONNECT=1 ++export FI_OFI_RXM_SAR_LIMIT=524288 ++export FI_OFI_RXM_BUFFER_SIZE=131072 ++export MPICH_SMP_SINGLE_COPY_SIZE=16384 ++export CRAY_OMP_CHECK_AFFINITY=TRUE ++export SLURM_CPU_FREQ_REQ=2250000 ++ ++# Launch the parallel job ++# Using nodes x ntasks-per-node MPI processes ++# 8 MPI processes per node ++# 16 OpenMP threads per MPI process ++# Additional srun options to pin one thread per physical core ++ ++DEVITO_MPI=diag2 srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --devito 1 ++srun --distribution=block:block --hint=nomultithread python3 wave3d_b.py -d 1024 1024 1024 --nt 512 -so 4 --xdsl 1
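Note on the BENCH level introduced in fast/logging.patch above: it is built entirely on Python's standard logging machinery. The following is a minimal standalone sketch of that mechanism using the same values, not Devito's devito.logger module itself; the logger name "bench-demo" and the bare bench() helper here are illustrative placeholders.

import logging

# Register a "BENCH" level name, mirroring fast/logging.patch above.
# BENCH shares DEBUG's numeric value (logging.DEBUG == 10), so
# addLevelName relabels level 10 as "BENCH" for this process.
BENCH = logging.DEBUG
logging.addLevelName(BENCH, "BENCH")

logging.basicConfig(level=BENCH)
logger = logging.getLogger("bench-demo")

def bench(msg, *args, **kwargs):
    # Analogous in spirit to the bench() helper added to devito/logger.py.
    logger.log(BENCH, msg, *args, **kwargs)

bench("operator ran in %.2f s", 1.23)  # -> BENCH:bench-demo:operator ran in 1.23 s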
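The job scripts select this level through "export DEVITO_LOGGING=BENCH", which only resolves because logging.patch also adds a 'BENCH' entry to logger_registry. Below is a minimal sketch of that name-to-level lookup, assuming the registry contents shown in the patch; the 'PERF': 19 entry is inferred from the "PERF = 19" definition in the patch context, and the lookup code itself is illustrative rather than Devito's actual configuration machinery.

import logging
import os

# Registry mirroring devito/logger.py after logging.patch is applied.
BENCH = logging.DEBUG
logger_registry = {
    'DEBUG': logging.DEBUG,
    'PERF': 19,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL,
    'BENCH': BENCH,
}

# Resolve a DEVITO_LOGGING-style environment variable (e.g. the
# "export DEVITO_LOGGING=BENCH" set in the job scripts) to a level.
level_name = os.environ.get('DEVITO_LOGGING', 'INFO')
level = logger_registry[level_name]  # KeyError for unregistered names
logging.getLogger('devito-demo').setLevel(level)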