From 6171e4ce6ea0a1187e8cb293da4fcf49b5a04ea0 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Tue, 24 Oct 2023 10:53:53 +0100 Subject: [PATCH 1/3] Use Devito's GPU magic block size in benchmark scripts. --- fast/diffusion_2D_wBCs.py | 10 ++++++++-- fast/diffusion_3D_wBCs.py | 10 ++++++++-- fast/wave2d_b.py | 7 ++++++- fast/wave3d_b.py | 7 ++++++- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/fast/diffusion_2D_wBCs.py b/fast/diffusion_2D_wBCs.py index 917454487d..0ed31cee92 100644 --- a/fast/diffusion_2D_wBCs.py +++ b/fast/diffusion_2D_wBCs.py @@ -5,7 +5,7 @@ import argparse import numpy as np -from devito import Grid, TimeFunction, Eq, solve, Operator, Constant, norm, XDSLOperator +from devito import Grid, TimeFunction, Eq, solve, Operator, Constant, norm, XDSLOperator, configuration from examples.cfd import init_hat from fast.bench_utils import plot_2dfunc @@ -54,7 +54,13 @@ init_hat(field=u.data[0], dx=dx, dy=dy, value=1.) if args.devito: - op = Operator([eq_stencil], name='DevitoOperator') + + # To measure Devito at its best on GPU, we have to set the tile size manually + opt = None + if configuration['platform'].name == 'nvidiaX': + opt = ('advanced', {'par-tile': (32, 4, 8)}) + + op = Operator([eq_stencil], name='DevitoOperator', opt=opt) op.apply(time=nt, dt=dt, a=nu) print("Devito Field norm is:", norm(u)) diff --git a/fast/diffusion_3D_wBCs.py b/fast/diffusion_3D_wBCs.py index daae383d49..7a90f762eb 100644 --- a/fast/diffusion_3D_wBCs.py +++ b/fast/diffusion_3D_wBCs.py @@ -6,7 +6,7 @@ import numpy as np from devito import (Grid, TimeFunction, Eq, solve, Operator, Constant, - norm, XDSLOperator) + norm, XDSLOperator, configuration) from fast.bench_utils import plot_3dfunc parser = argparse.ArgumentParser(description='Process arguments.') @@ -61,7 +61,13 @@ if args.devito: u.data[:, :, :, :] = 0 u.data[:, :, :, int(nz/2)] = 1 - op = Operator([eq_stencil], name='DevitoOperator') + + # To measure Devito at its best on GPU, we have to set the 
tile size manually + opt = None + if configuration['platform'].name == 'nvidiaX': + opt = ('advanced', {'par-tile': (32, 4, 8)}) + op = Operator([eq_stencil], name='DevitoOperator', opt=opt) + # Apply the operator for a number of timesteps op.apply(time=nt, dt=dt, a=nu) print("Devito Field norm is:", norm(u)) diff --git a/fast/wave2d_b.py b/fast/wave2d_b.py index 54afd28794..b446ab8539 100644 --- a/fast/wave2d_b.py +++ b/fast/wave2d_b.py @@ -89,9 +89,14 @@ print("Init norm:", np.linalg.norm(u.data[:])) if args.devito: + # To measure Devito at its best on GPU, we have to set the tile size manually + opt = None + if configuration['platform'].name == 'nvidiaX': + opt = ('advanced', {'par-tile': (32, 4, 8)}) + # Run more with no sources now (Not supported in xdsl) # op1 = Operator([stencil], name='DevitoOperator', subs=grid.spacing_map) - op1 = Operator([stencil], name='DevitoOperator') + op1 = Operator([stencil], name='DevitoOperator', opt=opt) op1.apply(time=nt, dt=dt) configuration['mpi'] = 0 diff --git a/fast/wave3d_b.py b/fast/wave3d_b.py index 0c13f2dba8..daceb38458 100644 --- a/fast/wave3d_b.py +++ b/fast/wave3d_b.py @@ -100,9 +100,14 @@ if args.devito: + # To measure Devito at its best on GPU, we have to set the tile size manually + opt = None + if configuration['platform'].name == 'nvidiaX': + opt = ('advanced', {'par-tile': (32, 4, 8)}) + # Run more with no sources now (Not supported in xdsl) # op1 = Operator([stencil], name='DevitoOperator', subs=grid.spacing_map) - op1 = Operator([stencil], name='DevitoOperator') + op1 = Operator([stencil], name='DevitoOperator', opt=opt) op1.apply(time=nt, dt=dt) configuration['mpi'] = 0 From f49d8a719832aa273ae608ff6444b028326ba52b Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Tue, 24 Oct 2023 12:03:08 +0100 Subject: [PATCH 2/3] Shell script and slurm file. 
--- fast/gpu.sh | 36 ++++++++++++++++++++++ fast/slurm-jobs/gpu.slurm | 64 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 fast/gpu.sh create mode 100755 fast/slurm-jobs/gpu.slurm diff --git a/fast/gpu.sh b/fast/gpu.sh new file mode 100644 index 0000000000..ab4a79e547 --- /dev/null +++ b/fast/gpu.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Simple script to run the GPU benchmarks locally, +# for simple sanity checks. + +# Target the NVIDIA GPU platform. +export DEVITO_PLATFORM=nvidiaX +# Enable benchmark logging; the "section0" line it prints is parsed below. +export DEVITO_LOGGING=BENCH + +# Just extract the reported throughput from the whole output of the passed command +# For GPU computing, we currently measure only the compute part +# That is, we exclude the data copying to and from the device. +get_throughput() { + #echo $($@) + $@ |& grep section0 | head -n 1 | cut -d ' ' -f10 +} + +# Iterate over benchmarks and cases, print simple CSV data to stdout +# Copy-pastes nicely in Google Sheets +echo bench_name,so,Devito,xDSL +for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "wave3d_b.py -d 512 512 512 --nt 512" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024" "diffusion_3D_wBCs.py -d 512 512 512 --nt 512" +# for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024" +do + # Get the benchmark file for printing + bench_name=$(echo $bench | cut -d ' ' -f1) + # Iterate over measured space orders + for so in 2 4 8 + do + # Get the throughputs + devito_time=$(get_throughput python $bench -so $so --devito 1) + xdsl_time=$(get_throughput python $bench -so $so --xdsl 1) + # print CSV line + echo $bench_name,$so,$devito_time,$xdsl_time + done +done diff --git a/fast/slurm-jobs/gpu.slurm b/fast/slurm-jobs/gpu.slurm new file mode 100755 index 0000000000..98d0ee77e9 --- /dev/null +++ b/fast/slurm-jobs/gpu.slurm @@ -0,0 +1,64 @@ +#!/bin/bash + +# Slurm job options (job-name, compute nodes, job time) +#SBATCH --job-name=Devito_GPU +#SBATCH 
--time=00:15:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:1 + +# Replace the account (budget code) below with your own project code (e.g. t01) +#SBATCH --account=d011 +#SBATCH --partition=gpu +#SBATCH --qos=short +#SBATCH -o ./jobs-output/gpu.%j.out # STDOUT + +SHARED=/work/d011/d011/shared +module use $SHARED/modules + +export DEVITO_ARCH=nvc +export DEVITO_PLATFORM=nvidiaX +export DEVITO_LANGUAGE=openacc + +module load sc-23 +cd $SHARED/software/devito/fast + +# Propagate the cpus-per-task setting from script to srun commands +# By default, Slurm does not propagate this setting from the sbatch +# options to srun commands in the job script. If this is not done, +# process/thread pinning may be incorrect leading to poor performance +export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK + +# Simple script to run the GPU benchmarks through srun, +# for simple sanity checks. + +# Target the NVIDIA GPU platform (already exported above; kept for parity with gpu.sh). +export DEVITO_PLATFORM=nvidiaX +# Enable benchmark logging; the "section0" line it prints is parsed below. +export DEVITO_LOGGING=BENCH + +# Just extract the reported throughput from the whole output of the passed command +# For GPU computing, we currently measure only the compute part +# That is, we exclude the data copying to and from the device. 
+get_throughput() { + #echo $($@) + $@ |& grep section0 | head -n 1 | cut -d ' ' -f10 +} + +# Iterate over benchmarks and cases, print simple CSV data to stdout +# Copy-pastes nicely in Google Sheets +echo bench_name,so,Devito,xDSL +for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "wave3d_b.py -d 512 512 512 --nt 512" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024" "diffusion_3D_wBCs.py -d 512 512 512 --nt 512" +# for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024" +do + # Get the benchmark file for printing + bench_name=$(echo $bench | cut -d ' ' -f1) + # Iterate over measured space orders + for so in 2 4 8 + do + # Get the throughputs + devito_time=$(get_throughput srun python $bench -so $so --devito 1) + xdsl_time=$(get_throughput srun python $bench -so $so --xdsl 1) + # print CSV line + echo $bench_name,$so,$devito_time,$xdsl_time + done +done From 3f85cd2ab5675e6fa7f5d4fd3b85f82f71afcf5c Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Tue, 24 Oct 2023 13:05:05 +0100 Subject: [PATCH 3/3] Scripts were extracting GFLop/s, not GPts/s. --- fast/gpu.sh | 2 +- fast/slurm-jobs/gpu.slurm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fast/gpu.sh b/fast/gpu.sh index ab4a79e547..4d547855e2 100644 --- a/fast/gpu.sh +++ b/fast/gpu.sh @@ -13,7 +13,7 @@ export DEVITO_LOGGING=BENCH # That is, we exclude the data copying to and from the device. get_throughput() { #echo $($@) - $@ |& grep section0 | head -n 1 | cut -d ' ' -f10 + $@ |& grep section0 | head -n 1 | cut -d ' ' -f12 } # Iterate over benchmarks and cases, print simple CSV data to stdout diff --git a/fast/slurm-jobs/gpu.slurm b/fast/slurm-jobs/gpu.slurm index 98d0ee77e9..e9cb504e75 100755 --- a/fast/slurm-jobs/gpu.slurm +++ b/fast/slurm-jobs/gpu.slurm @@ -41,7 +41,7 @@ export DEVITO_LOGGING=BENCH # That is, we exclude the data copying to and from the device. 
get_throughput() { #echo $($@) - $@ |& grep section0 | head -n 1 | cut -d ' ' -f10 + $@ |& grep section0 | head -n 1 | cut -d ' ' -f12 } # Iterate over benchmarks and cases, print simple CSV data to stdout