Update autotuning scripts for GPU kernels
mkrack committed Jun 27, 2023
1 parent f252567 commit cceb3f9
Showing 3 changed files with 44 additions and 44 deletions.
9 changes: 3 additions & 6 deletions src/acc/libsmm_acc/tune/tune_collect.py
@@ -53,13 +53,10 @@ def main(tune_dir=Path(".")):
         if not dir.is_dir():
             continue

-        for exe_fpath in sorted(dir.glob("tune_*main.c*")):
-            mnk = tuple(int(i) for i in re_mnk.search(exe_fpath.name).groups())
+        for log_fpath in sorted(dir.glob("tune_*.log")):
+            mnk = tuple(int(i) for i in re_mnk.search(log_fpath.name).groups())
             if mnk not in winners:
                 winners[mnk] = awinner()
-            log_fpath = exe_fpath.parent / exe_fpath.name.replace(
-                "_main.cu", ".log"
-            ).replace("_main.cpp", ".log")
             if not log_fpath.exists():
                 winners[mnk] = awinner(value=f"log missing: {log_fpath}", missing=1)
                 print(
@@ -100,7 +97,7 @@ def main(tune_dir=Path(".")):

 # ===============================================================================
 def process_log(log_fn: Path, mnk, winners):
-    print(f"Reading: {log_fn}")
+    print(f"Reading {log_fn}")

     content = log_fn.read_text()
     m = re_errors.search(content)
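With this change, tune_collect.py keys the collection loop on the log files themselves rather than on the generated tune_*main.c* sources. A minimal sketch of the filename-to-triple mapping this relies on; the regex below is an assumption, since the actual re_mnk pattern is defined elsewhere in the script:

import re
from pathlib import Path

re_mnk = re.compile(r"tune_(\d+)x(\d+)x(\d+)")  # assumed pattern, sketch only

def mnk_from_log(log_fpath: Path) -> tuple:
    """Return the (m, n, k) triple encoded in a tuning log's filename."""
    match = re_mnk.search(log_fpath.name)
    if match is None:
        raise ValueError(f"no (m, n, k) triple in {log_fpath.name}")
    return tuple(int(i) for i in match.groups())

print(mnk_from_log(Path("tune_23x23x23_exe0.log")))  # -> (23, 23, 23)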
7 changes: 4 additions & 3 deletions src/acc/libsmm_acc/tune/tune_merge.py
@@ -14,12 +14,13 @@
 import argparse

 sys.path.append("../")
-from kernels.smm_acc_predict import params_dict_to_kernel  # noqa: E402
+from kernels.smm_acc import params_dict_to_kernel  # noqa: E402


 def main(param_fn):
+
     # Read new kernel parameters
-    param_new = "parameters.json"
+    param_new = "../parameters/parameters.json"
     with open(param_new) as f:
         new_kernels = [params_dict_to_kernel(**params) for params in json.load(f)]

@@ -70,7 +71,7 @@ def main(param_fn):
"--params",
metavar="parameters_GPU.json",
type=str,
default="../parameters/parameters_P100.json",
default="../parameters/parameters_A100.json",
help="parameter file in which to merge the newly obtained autotuned parameters",
)

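For context, a minimal sketch of the merge step these path changes feed into: kernels read from the new parameters.json replace same-(m, n, k) entries in the per-GPU file passed via --params. This is a paraphrase, not tune_merge.py's actual code, and it assumes each JSON record carries plain "m", "n", "k" keys:

import json

def merge_parameters(existing_fn: str, new_fn: str) -> list:
    # Index existing kernels by their (m, n, k) triple (assumed record layout).
    with open(existing_fn) as f:
        kernels = {(p["m"], p["n"], p["k"]): p for p in json.load(f)}
    with open(new_fn) as f:
        for p in json.load(f):
            kernels[(p["m"], p["n"], p["k"])] = p  # newly autotuned entry wins
    return [kernels[key] for key in sorted(kernels)]

# e.g.: merge_parameters("../parameters/parameters_A100.json",
#                        "../parameters/parameters.json")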
72 changes: 37 additions & 35 deletions src/acc/libsmm_acc/tune/tune_setup.py
@@ -37,6 +37,7 @@ def main(
     blocks_from_param_file,
     tune_dir: Path,
 ):
+
     # Read existing parameters
     assert (
         param_fn.name in gpu_architectures.keys()
@@ -71,7 +72,7 @@ def main(
     triples = combinations(*blocksizes)
     print(f"Requested to autotune {len(triples)} triplets")

-    for m, n, k in triples:
+    for (m, n, k) in triples:
         existing = [kern for kern in autotuned_kernels if kern.can_handle(m, n, k)]
         if existing:
             print(
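The combinations helper used in this hunk expands the requested block sizes into every (m, n, k) triple. A sketch of the assumed behavior (the real helper is defined near the end of tune_setup.py):

from itertools import product

def combinations(*sizes):
    # Assumed behavior: full Cartesian product of the requested block sizes.
    return list(product(sizes, repeat=3))

print(combinations(4, 8))       # 8 triples, from (4, 4, 4) to (8, 8, 8)
print(len(combinations(4, 8)))  # 2**3 = 8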
@@ -252,7 +253,8 @@ def gen_benchmark(outdir, gpu_properties, autotuning_properties, compiler, m, n,


 # ===============================================================================
-def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
+def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task, max_num_nodes=0):
+
     file_extension = get_file_extension_from_compiler(compiler)

     tprefix = f"tune_{int(m)}x{int(n)}x{int(k)}"
@@ -262,10 +264,7 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
         num_nodes = min(len(all_exe), max_num_nodes)
     else:
         num_nodes = len(all_exe)
-    if num_nodes < 3:
-        time = "4:00:00"
-    else:
-        time = "0:30:00"
+    time = "00:40:00"

     output = f"""\
 #!/bin/bash -l
@@ -275,30 +274,26 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task={int(cpus_per_task)}
 #SBATCH --time={time}
-#SBATCH --partition=bardpeak
-source ${{MODULESHOME}}/init/sh; module use /global/opt/modulefiles;
-module unload PrgEnv-cray
-module load PrgEnv-gnu
-"""
-    if compiler == "nvcc":
-        output += "module load cudatoolkit/8.0.61_2.4.9-6.0.7.0_17.1__g899857c\n"
-    else:  # i.e. compiler = hipcc
-        output += "module load rocm/5.1.0; module load craype-accel-amd-gfx90a;\n"
-
-    output += """\
-export ROCR_VISIBLE_DEVICES=4 # GPU corresponding to Numa node 0
+#SBATCH --account=jiek61
+#SBATCH --partition=dc-gpu
+#SBATCH --cuda-mps
+#SBATCH --gres=gpu:4
+echo "Starting batch job $SLURM_JOB_NAME with the job id $SLURM_JOB_ID ($SLURM_JOB_ACCOUNT) on cluster $SLURM_CLUSTER_NAME at $(date)"
+echo "$SLURM_NTASKS tasks with $SLURM_NTASKS_PER_NODE tasks per node, $SLURM_NTASKS_PER_CORE tasks per core, and $SLURM_CPUS_PER_TASK threads per task have been allocated on $SLURM_JOB_NUM_NODES nodes"
+echo "Node list: $SLURM_JOB_NODELIST"
+module purge
+module add GCC/11.3.0
+module add ParaStationMPI/5.8.0-1-mt
+module add CUDA/11.7
+module list
+cd $SLURM_SUBMIT_DIR
 date
 nvidia-smi
 t1=$(date +%s)
 """

     # Compilation
     num_nodes_busy = 0
     for exe in all_exe:
         output += (
-            f"srun --nodes=1 --bcast=/tmp/${{USER}} --ntasks=1 --ntasks-per-node=1"
+            f"srun --nodes=1 --ntasks=1 --ntasks-per-node=1"
             f" --cpus-per-task={cpus_per_task} --exact make -j {cpus_per_task} {exe} &\n"
         )
         num_nodes_busy += 1
@@ -307,13 +302,14 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
             num_nodes_busy = 0

     output += "wait\n"
-    output += "date\n"
-    output += "\n"
+    output += "t2=$(date +%s)\n"
+    output += "echo $((t2-t1)) seconds for compilation step\n\n"

     # Execution
+    output += "t1=$(date +%s)\n"
     for exe in all_exe:
         output += (
-            f"srun --nodes=1 --bcast=/tmp/${{USER}} --ntasks=1 --ntasks-per-node=1"
+            f"srun --nodes=1 --ntasks=1 --ntasks-per-node=1"
             f" --cpus-per-task=1 --exact ./{exe} > {exe}.log 2>&1 & \n"
         )
         num_nodes_busy += 1
@@ -322,28 +318,34 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
             num_nodes_busy = 0

     output += "wait\n"
-    output += "date\n"
-    output += "\n"
+    output += "t2=$(date +%s)\n"
+    output += "echo $((t2-t1)) seconds for execution step\n\n"

     # Winner
     output += "echo Over all winner:\n"
-    output += f"grep WINNER {tprefix}_exe*.log | sort -n --field-separator='#' -k 2 | tail -n 1\n"
-    output += "\n"
-    output += "#EOF\n"
+    output += f"grep WINNER {tprefix}_exe*.log | sort -n --field-separator='#' -k 2 | tail -n 1\n\n"
+
+    # Cleaning
+    output += "make realclean\n"

     fn = outdir / f"{tprefix}.job"
     writefile(fn, output)


 # ===============================================================================
 def gen_makefile(outdir, compiler, arch):
+
     file_extension = get_file_extension_from_compiler(compiler)

     # header
     output = ".SECONDARY:\n"
     output += f"vpath %{file_extension}../\n\n"
-    output += ".PHONY: do_nothing build_all \n\n"
+    output += ".PHONY: do_nothing build_all clean realclean\n\n"
     output += "do_nothing:\n\n"
+    output += "clean:\n"
+    output += "\trm -f *.o\n\n"
+    output += "realclean: clean\n"
+    output += "\trm -f *.cu\n\n"

     # target "build_all"
     all_exe_src = sorted(
@@ -461,15 +463,15 @@ def combinations(*sizes):
"-c",
"--cpus_per_task",
metavar="INT",
default=12,
default=128,
type=int,
help="Number of CPUs required per task",
)
parser.add_argument(
"-n",
"--nodes",
metavar="INT",
default=0,
default=1,
type=int,
help="Maximum number of nodes an slurm allocation can get. 0: not a limiting factor"
+ "(choose this option if you can allocate jobs of 20-30 nodes without a problem.",
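The generated job script keeps all allocated nodes busy by launching one backgrounded srun task per node and inserting a wait barrier whenever num_nodes tasks are in flight, once for the compilation pass and once for the execution pass. A self-contained paraphrase of that round-robin pattern (not the script's exact code):

def schedule(all_exe, num_nodes, cpus_per_task=128):
    # Emit one backgrounded srun per executable; drain with "wait" whenever
    # every allocated node is occupied.
    output = ""
    num_nodes_busy = 0
    for exe in all_exe:
        output += (
            f"srun --nodes=1 --ntasks=1 --ntasks-per-node=1"
            f" --cpus-per-task={cpus_per_task} --exact make -j {cpus_per_task} {exe} &\n"
        )
        num_nodes_busy += 1
        if num_nodes_busy == num_nodes:  # all nodes busy: wait for the batch
            output += "wait\n"
            num_nodes_busy = 0
    output += "wait\n"  # final barrier before the next phase
    return output

print(schedule([f"tune_4x4x4_exe{i}" for i in range(3)], num_nodes=2))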
