From 6171e4ce6ea0a1187e8cb293da4fcf49b5a04ea0 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 24 Oct 2023 10:53:53 +0100
Subject: [PATCH 1/3] Use Devito's GPU magic block size in benchmark scripts.

---
 fast/diffusion_2D_wBCs.py | 10 ++++++++--
 fast/diffusion_3D_wBCs.py | 10 ++++++++--
 fast/wave2d_b.py          |  7 ++++++-
 fast/wave3d_b.py          |  7 ++++++-
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/fast/diffusion_2D_wBCs.py b/fast/diffusion_2D_wBCs.py
index 917454487d..0ed31cee92 100644
--- a/fast/diffusion_2D_wBCs.py
+++ b/fast/diffusion_2D_wBCs.py
@@ -5,7 +5,7 @@
 import argparse
 import numpy as np
 
-from devito import Grid, TimeFunction, Eq, solve, Operator, Constant, norm, XDSLOperator
+from devito import Grid, TimeFunction, Eq, solve, Operator, Constant, norm, XDSLOperator, configuration
 from examples.cfd import init_hat
 from fast.bench_utils import plot_2dfunc
 
@@ -54,7 +54,13 @@
 init_hat(field=u.data[0], dx=dx, dy=dy, value=1.)
 
 if args.devito:
-    op = Operator([eq_stencil], name='DevitoOperator')
+
+    # To measure Devito at its best on GPU, we have to set the tile size manually
+    opt = configuration['opt']  # default: the globally configured opt level
+    if configuration['platform'].name == 'nvidiaX':
+        opt = ('advanced', {'par-tile': (32, 4, 8)})
+
+    op = Operator([eq_stencil], name='DevitoOperator', opt=opt)
     op.apply(time=nt, dt=dt, a=nu)
     print("Devito Field norm is:", norm(u))
 
diff --git a/fast/diffusion_3D_wBCs.py b/fast/diffusion_3D_wBCs.py
index daae383d49..7a90f762eb 100644
--- a/fast/diffusion_3D_wBCs.py
+++ b/fast/diffusion_3D_wBCs.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from devito import (Grid, TimeFunction, Eq, solve, Operator, Constant,
-                    norm, XDSLOperator)
+                    norm, XDSLOperator, configuration)
 from fast.bench_utils import plot_3dfunc
 
 parser = argparse.ArgumentParser(description='Process arguments.')
@@ -61,7 +61,13 @@
 if args.devito:
     u.data[:, :, :, :] = 0
     u.data[:, :, :, int(nz/2)] = 1
-    op = Operator([eq_stencil], name='DevitoOperator')
+
+    # To measure Devito at its best on GPU, we have to set the tile size manually
+    opt = configuration['opt']  # default: the globally configured opt level
+    if configuration['platform'].name == 'nvidiaX':
+        opt = ('advanced', {'par-tile': (32, 4, 8)})
+    op = Operator([eq_stencil], name='DevitoOperator', opt=opt)
+
     # Apply the operator for a number of timesteps
     op.apply(time=nt, dt=dt, a=nu)
     print("Devito Field norm is:", norm(u))
diff --git a/fast/wave2d_b.py b/fast/wave2d_b.py
index 54afd28794..b446ab8539 100644
--- a/fast/wave2d_b.py
+++ b/fast/wave2d_b.py
@@ -89,9 +89,14 @@
 print("Init norm:", np.linalg.norm(u.data[:]))
 
 if args.devito:
+    # To measure Devito at its best on GPU, we have to set the tile size manually
+    opt = configuration['opt']  # default: the globally configured opt level
+    if configuration['platform'].name == 'nvidiaX':
+        opt = ('advanced', {'par-tile': (32, 4, 8)})
+
     # Run more with no sources now (Not supported in xdsl)
     # op1 = Operator([stencil], name='DevitoOperator', subs=grid.spacing_map)
-    op1 = Operator([stencil], name='DevitoOperator')
+    op1 = Operator([stencil], name='DevitoOperator', opt=opt)
     op1.apply(time=nt, dt=dt)
 
     configuration['mpi'] = 0
diff --git a/fast/wave3d_b.py b/fast/wave3d_b.py
index 0c13f2dba8..daceb38458 100644
--- a/fast/wave3d_b.py
+++ b/fast/wave3d_b.py
@@ -100,9 +100,14 @@
 
 
 if args.devito:
+    # To measure Devito at its best on GPU, we have to set the tile size manually
+    opt = configuration['opt']  # default: the globally configured opt level
+    if configuration['platform'].name == 'nvidiaX':
+        opt = ('advanced', {'par-tile': (32, 4, 8)})
+
     # Run more with no sources now (Not supported in xdsl)
     # op1 = Operator([stencil], name='DevitoOperator', subs=grid.spacing_map)
-    op1 = Operator([stencil], name='DevitoOperator')
+    op1 = Operator([stencil], name='DevitoOperator', opt=opt)
     op1.apply(time=nt, dt=dt)
 
     configuration['mpi'] = 0

From f49d8a719832aa273ae608ff6444b028326ba52b Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 24 Oct 2023 12:03:08 +0100
Subject: [PATCH 2/3] Shell script and slurm file.

---
 fast/gpu.sh               | 36 ++++++++++++++++++++++
 fast/slurm-jobs/gpu.slurm | 64 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 fast/gpu.sh
 create mode 100755 fast/slurm-jobs/gpu.slurm

diff --git a/fast/gpu.sh b/fast/gpu.sh
new file mode 100644
index 0000000000..ab4a79e547
--- /dev/null
+++ b/fast/gpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Simple script to run single threaded benchmarks locally,
+# for simple sanity checks.
+
+# Select Devito's nvidiaX (NVIDIA GPU) platform.
+export DEVITO_PLATFORM=nvidiaX
+# Set Devito's logging level to BENCH.
+export DEVITO_LOGGING=BENCH
+
+# Just extract the reported throughput from the whole output of the passed command
+# For GPU computing, we currently measure only the compute part
+# That is, we exclude the data copying to and from the device.
+get_throughput() {
+    #echo $($@)
+    $@ |& grep section0 | head -n 1 | cut -d ' ' -f10
+}
+
+# Iterate over benchmarks and cases, print simple CSV data to stdout
+# Copy-pastes nicely in Google Sheets
+echo bench_name,so,Devito,xDSL
+for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "wave3d_b.py -d 512 512 512 --nt 512" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024" "diffusion_3D_wBCs.py -d 512 512 512 --nt 512"
+# for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024"
+do
+  # Get the benchmark file for printing
+  bench_name=$(echo $bench | cut -d ' ' -f1)
+  # Iterate over measured space orders
+  for so in 2 4 8
+    do
+      # Get the throughputs
+      devito_time=$(get_throughput python $bench -so $so --devito 1)
+      xdsl_time=$(get_throughput python $bench -so $so --xdsl 1)
+      # print CSV line
+      echo $bench_name,$so,$devito_time,$xdsl_time
+  done
+done
diff --git a/fast/slurm-jobs/gpu.slurm b/fast/slurm-jobs/gpu.slurm
new file mode 100755
index 0000000000..98d0ee77e9
--- /dev/null
+++ b/fast/slurm-jobs/gpu.slurm
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Slurm job options (job-name, compute nodes, job time)
+#SBATCH --job-name=Devito_GPU
+#SBATCH --time=00:15:00
+#SBATCH --nodes=1
+#SBATCH --gres=gpu:1
+
+# Replace d011 below with your project's budget code (e.g. t01)
+#SBATCH --account=d011
+#SBATCH --partition=gpu
+#SBATCH --qos=short
+#SBATCH -o ./jobs-output/gpu.%j.out # STDOUT
+
+SHARED=/work/d011/d011/shared
+module use $SHARED/modules
+
+export DEVITO_ARCH=nvc
+export DEVITO_PLATFORM=nvidiaX
+export DEVITO_LANGUAGE=openacc
+
+module load sc-23
+cd $SHARED/software/devito/fast
+
+# Propagate the cpus-per-task setting from script to srun commands
+#    By default, Slurm does not propagate this setting from the sbatch
+#    options to srun commands in the job script. If this is not done,
+#    process/thread pinning may be incorrect leading to poor performance
+export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
+
+# Run each benchmark through srun and
+# print its throughput as CSV.
+
+# Select Devito's nvidiaX (NVIDIA GPU) platform (repeats the export above).
+export DEVITO_PLATFORM=nvidiaX
+# Enable debug logging.
+export DEVITO_LOGGING=BENCH
+
+# Just extract the reported throughput from the whole output of the passed command
+# For GPU computing, we currently measure only the compute part
+# That is, we exclude the data copying to and from the device.
+get_throughput() {
+    #echo $($@)
+    $@ |& grep section0 | head -n 1 | cut -d ' ' -f10
+}
+
+# Iterate over benchmarks and cases, print simple CSV data to stdout
+# Copy-pastes nicely in Google Sheets
+echo bench_name,so,Devito,xDSL
+for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "wave3d_b.py -d 512 512 512 --nt 512" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024" "diffusion_3D_wBCs.py -d 512 512 512 --nt 512"
+# for bench in "wave2d_b.py -d 8192 8192 --nt 1024" "diffusion_2D_wBCs.py -d 8192 8192 --nt 1024"
+do
+  # Get the benchmark file for printing
+  bench_name=$(echo $bench | cut -d ' ' -f1)
+  # Iterate over measured space orders
+  for so in 2 4 8
+    do
+      # Get the throughputs
+      devito_time=$(get_throughput srun python $bench -so $so --devito 1)
+      xdsl_time=$(get_throughput srun python $bench -so $so --xdsl 1)
+      # print CSV line
+      echo $bench_name,$so,$devito_time,$xdsl_time
+  done
+done

From 3f85cd2ab5675e6fa7f5d4fd3b85f82f71afcf5c Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 24 Oct 2023 13:05:05 +0100
Subject: [PATCH 3/3] Scripts were extracting GFLop/s, not GPts/s.

---
 fast/gpu.sh               | 2 +-
 fast/slurm-jobs/gpu.slurm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fast/gpu.sh b/fast/gpu.sh
index ab4a79e547..4d547855e2 100644
--- a/fast/gpu.sh
+++ b/fast/gpu.sh
@@ -13,7 +13,7 @@ export DEVITO_LOGGING=BENCH
 # That is, we exclude the data copying to and from the device.
 get_throughput() {
     #echo $($@)
-    $@ |& grep section0 | head -n 1 | cut -d ' ' -f10
+    "$@" |& grep section0 | head -n 1 | cut -d ' ' -f12  # field 12: GPts/s (field 10 is GFlop/s)
 }
 
 # Iterate over benchmarks and cases, print simple CSV data to stdout
diff --git a/fast/slurm-jobs/gpu.slurm b/fast/slurm-jobs/gpu.slurm
index 98d0ee77e9..e9cb504e75 100755
--- a/fast/slurm-jobs/gpu.slurm
+++ b/fast/slurm-jobs/gpu.slurm
@@ -41,7 +41,7 @@ export DEVITO_LOGGING=BENCH
 # That is, we exclude the data copying to and from the device.
 get_throughput() {
     #echo $($@)
-    $@ |& grep section0 | head -n 1 | cut -d ' ' -f10
+    "$@" |& grep section0 | head -n 1 | cut -d ' ' -f12  # field 12: GPts/s (field 10 is GFlop/s)
 }
 
 # Iterate over benchmarks and cases, print simple CSV data to stdout