Update autotuning scripts for GPU kernels
mkrack committed Jun 27, 2023
1 parent f252567 commit cceb3f9
Showing 3 changed files with 44 additions and 44 deletions.
9 changes: 3 additions & 6 deletions src/acc/libsmm_acc/tune/tune_collect.py
@@ -53,13 +53,10 @@ def main(tune_dir=Path(".")):
         if not dir.is_dir():
             continue

-        for exe_fpath in sorted(dir.glob("tune_*main.c*")):
-            mnk = tuple(int(i) for i in re_mnk.search(exe_fpath.name).groups())
+        for log_fpath in sorted(dir.glob("tune_*.log")):
+            mnk = tuple(int(i) for i in re_mnk.search(log_fpath.name).groups())
             if mnk not in winners:
                 winners[mnk] = awinner()
-            log_fpath = exe_fpath.parent / exe_fpath.name.replace(
-                "_main.cu", ".log"
-            ).replace("_main.cpp", ".log")
             if not log_fpath.exists():
                 winners[mnk] = awinner(value=f"log missing: {log_fpath}", missing=1)
                 print(
@@ -100,7 +97,7 @@ def main(tune_dir=Path(".")):

 # ===============================================================================
 def process_log(log_fn: Path, mnk, winners):
-    print(f"Reading: {log_fn}")
+    print(f"Reading {log_fn}")

     content = log_fn.read_text()
     m = re_errors.search(content)
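With this change, tune_collect.py keys the collection loop on the log files themselves rather than on the generated tune_*main.c* sources. A minimal sketch of the filename-to-triple mapping this relies on; the regex below is an assumption, since the actual re_mnk pattern is defined elsewhere in the script:

import re
from pathlib import Path

re_mnk = re.compile(r"tune_(\d+)x(\d+)x(\d+)")  # assumed pattern, sketch only

def mnk_from_log(log_fpath: Path) -> tuple:
    """Return the (m, n, k) triple encoded in a tuning log's filename."""
    match = re_mnk.search(log_fpath.name)
    if match is None:
        raise ValueError(f"no (m, n, k) triple in {log_fpath.name}")
    return tuple(int(i) for i in match.groups())

print(mnk_from_log(Path("tune_23x23x23_exe0.log")))  # -> (23, 23, 23)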
7 changes: 4 additions & 3 deletions src/acc/libsmm_acc/tune/tune_merge.py
@@ -14,12 +14,13 @@
 import argparse

 sys.path.append("../")
-from kernels.smm_acc_predict import params_dict_to_kernel  # noqa: E402
+from kernels.smm_acc import params_dict_to_kernel  # noqa: E402


 def main(param_fn):
+
     # Read new kernel parameters
-    param_new = "parameters.json"
+    param_new = "../parameters/parameters.json"
     with open(param_new) as f:
         new_kernels = [params_dict_to_kernel(**params) for params in json.load(f)]

@@ -70,7 +71,7 @@ def main(param_fn):
"--params",
metavar="parameters_GPU.json",
type=str,
default="../parameters/parameters_P100.json",
default="../parameters/parameters_A100.json",
help="parameter file in which to merge the newly obtained autotuned parameters",
)

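For context, a minimal sketch of the merge step these path changes feed into: kernels read from the new parameters.json replace same-(m, n, k) entries in the per-GPU file passed via --params. This is a paraphrase, not tune_merge.py's actual code, and it assumes each JSON record carries plain "m", "n", "k" keys:

import json

def merge_parameters(existing_fn: str, new_fn: str) -> list:
    # Index existing kernels by their (m, n, k) triple (assumed record layout).
    with open(existing_fn) as f:
        kernels = {(p["m"], p["n"], p["k"]): p for p in json.load(f)}
    with open(new_fn) as f:
        for p in json.load(f):
            kernels[(p["m"], p["n"], p["k"])] = p  # newly autotuned entry wins
    return [kernels[key] for key in sorted(kernels)]

# e.g.: merge_parameters("../parameters/parameters_A100.json",
#                        "../parameters/parameters.json")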
72 changes: 37 additions & 35 deletions src/acc/libsmm_acc/tune/tune_setup.py
@@ -37,6 +37,7 @@ def main(
     blocks_from_param_file,
     tune_dir: Path,
 ):
+
     # Read existing parameters
     assert (
         param_fn.name in gpu_architectures.keys()
@@ -71,7 +72,7 @@ def main(
     triples = combinations(*blocksizes)
     print(f"Requested to autotune {len(triples)} triplets")

-    for m, n, k in triples:
+    for (m, n, k) in triples:
         existing = [kern for kern in autotuned_kernels if kern.can_handle(m, n, k)]
         if existing:
             print(
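The combinations helper used in this hunk expands the requested block sizes into every (m, n, k) triple. A sketch of the assumed behavior (the real helper is defined near the end of tune_setup.py):

from itertools import product

def combinations(*sizes):
    # Assumed behavior: full Cartesian product of the requested block sizes.
    return list(product(sizes, repeat=3))

print(combinations(4, 8))       # 8 triples, from (4, 4, 4) to (8, 8, 8)
print(len(combinations(4, 8)))  # 2**3 = 8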
@@ -252,7 +253,8 @@ def gen_benchmark(outdir, gpu_properties, autotuning_properties, compiler, m, n,


 # ===============================================================================
-def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
+def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task, max_num_nodes=0):
+
     file_extension = get_file_extension_from_compiler(compiler)

     tprefix = f"tune_{int(m)}x{int(n)}x{int(k)}"
@@ -262,10 +264,7 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
         num_nodes = min(len(all_exe), max_num_nodes)
     else:
         num_nodes = len(all_exe)
-    if num_nodes < 3:
-        time = "4:00:00"
-    else:
-        time = "0:30:00"
+    time = "00:40:00"

     output = f"""\
 #!/bin/bash -l
@@ -275,30 +274,26 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task={int(cpus_per_task)}
 #SBATCH --time={time}
-#SBATCH --partition=bardpeak
-source ${{MODULESHOME}}/init/sh; module use /global/opt/modulefiles;
-module unload PrgEnv-cray
-module load PrgEnv-gnu
-"""
-    if compiler == "nvcc":
-        output += "module load cudatoolkit/8.0.61_2.4.9-6.0.7.0_17.1__g899857c\n"
-    else:  # i.e. compiler = hipcc
-        output += "module load rocm/5.1.0; module load craype-accel-amd-gfx90a;\n"
-
-    output += """\
-export ROCR_VISIBLE_DEVICES=4 # GPU corresponding to Numa node 0
+#SBATCH --account=jiek61
+#SBATCH --partition=dc-gpu
+#SBATCH --cuda-mps
+#SBATCH --gres=gpu:4
+echo "Starting batch job $SLURM_JOB_NAME with the job id $SLURM_JOB_ID ($SLURM_JOB_ACCOUNT) on cluster $SLURM_CLUSTER_NAME at $(date)"
+echo "$SLURM_NTASKS tasks with $SLURM_NTASKS_PER_NODE tasks per node, $SLURM_NTASKS_PER_CORE tasks per core, and $SLURM_CPUS_PER_TASK threads per task have been allocated on $SLURM_JOB_NUM_NODES nodes"
+echo "Node list: $SLURM_JOB_NODELIST"
+module purge
+module add GCC/11.3.0
+module add ParaStationMPI/5.8.0-1-mt
+module add CUDA/11.7
+module list
+cd $SLURM_SUBMIT_DIR
 date
 nvidia-smi
 t1=$(date +%s)
 """

     # Compilation
     num_nodes_busy = 0
     for exe in all_exe:
         output += (
-            f"srun --nodes=1 --bcast=/tmp/${{USER}} --ntasks=1 --ntasks-per-node=1"
+            f"srun --nodes=1 --ntasks=1 --ntasks-per-node=1"
             f" --cpus-per-task={cpus_per_task} --exact make -j {cpus_per_task} {exe} &\n"
         )
         num_nodes_busy += 1
@@ -307,13 +302,14 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
             num_nodes_busy = 0

     output += "wait\n"
-    output += "date\n"
-    output += "\n"
+    output += "t2=$(date +%s)\n"
+    output += "echo $((t2-t1)) seconds for compilation step\n\n"

     # Execution
+    output += "t1=$(date +%s)\n"
     for exe in all_exe:
         output += (
-            f"srun --nodes=1 --bcast=/tmp/${{USER}} --ntasks=1 --ntasks-per-node=1"
+            f"srun --nodes=1 --ntasks=1 --ntasks-per-node=1"
             f" --cpus-per-task=1 --exact ./{exe} > {exe}.log 2>&1 & \n"
         )
         num_nodes_busy += 1
@@ -322,28 +318,34 @@ def gen_jobfile(outdir, compiler, m, n, k, cpus_per_task=12, max_num_nodes=0):
             num_nodes_busy = 0

     output += "wait\n"
-    output += "date\n"
-    output += "\n"
+    output += "t2=$(date +%s)\n"
+    output += "echo $((t2-t1)) seconds for execution step\n\n"

     # Winner
     output += "echo Over all winner:\n"
-    output += f"grep WINNER {tprefix}_exe*.log | sort -n --field-separator='#' -k 2 | tail -n 1\n"
-    output += "\n"
-    output += "#EOF\n"
+    output += f"grep WINNER {tprefix}_exe*.log | sort -n --field-separator='#' -k 2 | tail -n 1\n\n"
+
+    # Cleaning
+    output += "make realclean\n"

     fn = outdir / f"{tprefix}.job"
     writefile(fn, output)


 # ===============================================================================
 def gen_makefile(outdir, compiler, arch):
+
     file_extension = get_file_extension_from_compiler(compiler)

     # header
     output = ".SECONDARY:\n"
     output += f"vpath %{file_extension}../\n\n"
-    output += ".PHONY: do_nothing build_all \n\n"
+    output += ".PHONY: do_nothing build_all clean realclean\n\n"
     output += "do_nothing:\n\n"
+    output += "clean:\n"
+    output += "\trm -f *.o\n\n"
+    output += "realclean: clean\n"
+    output += "\trm -f *.cu\n\n"

     # target "build_all"
     all_exe_src = sorted(
@@ -461,15 +463,15 @@ def combinations(*sizes):
"-c",
"--cpus_per_task",
metavar="INT",
default=12,
default=128,
type=int,
help="Number of CPUs required per task",
)
parser.add_argument(
"-n",
"--nodes",
metavar="INT",
default=0,
default=1,
type=int,
help="Maximum number of nodes an slurm allocation can get. 0: not a limiting factor"
+ "(choose this option if you can allocate jobs of 20-30 nodes without a problem.",
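The generated job script keeps all allocated nodes busy by launching one backgrounded srun task per node and inserting a wait barrier whenever num_nodes tasks are in flight, once for the compilation pass and once for the execution pass. A self-contained paraphrase of that round-robin pattern (not the script's exact code):

def schedule(all_exe, num_nodes, cpus_per_task=128):
    # Emit one backgrounded srun per executable; drain with "wait" whenever
    # every allocated node is occupied.
    output = ""
    num_nodes_busy = 0
    for exe in all_exe:
        output += (
            f"srun --nodes=1 --ntasks=1 --ntasks-per-node=1"
            f" --cpus-per-task={cpus_per_task} --exact make -j {cpus_per_task} {exe} &\n"
        )
        num_nodes_busy += 1
        if num_nodes_busy == num_nodes:  # all nodes busy: wait for the batch
            output += "wait\n"
            num_nodes_busy = 0
    output += "wait\n"  # final barrier before the next phase
    return output

print(schedule([f"tune_4x4x4_exe{i}" for i in range(3)], num_nodes=2))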
