From b2b281949d2b58ccb85bfb77ea70f7ee0094e5ad Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Fri, 4 Aug 2023 11:03:38 +0100 Subject: [PATCH] Add more sensible and resilient tile sizes. --- devito/operator/xdsl_operator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py index eced20c2e19..d725dd2715a 100644 --- a/devito/operator/xdsl_operator.py +++ b/devito/operator/xdsl_operator.py @@ -51,7 +51,7 @@ MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' # gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = '"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{parallel-loop-tile-sizes=128,1,1},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,gpu-kernel-outlining,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{index-bitwidth=64},finalize-memref-to-llvm{index-bitwidth=64},convert-scf-to-cf,convert-cf-to-llvm{index-bitwidth=64},canonicalize,cse,convert-func-to-llvm{use-bare-ptr-memref-call-conv},gpu.module(convert-gpu-to-nvvm,reconcile-unrealized-casts,canonicalize,gpu-to-cubin),gpu-to-llvm,canonicalize,cse)"' +MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,gpu-kernel-outlining,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},finalize-memref-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},gpu.module(convert-gpu-to-nvvm,reconcile-unrealized-casts,canonicalize,gpu-to-cubin),gpu-to-llvm,canonicalize,cse)"' XDSL_CPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir,printf-to-llvm" XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm" @@ -120,6 +120,9 @@ def _jit_compile(self): xdsl_pipeline = XDSL_CPU_PIPELINE mlir_pipeline = MLIR_CPU_PIPELINE + block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])] + block_sizes = ','.join(str(bs) for bs in block_sizes) + if is_omp: mlir_pipeline = MLIR_OPENMP_PIPELINE @@ -132,7 +135,7 @@ def _jit_compile(self): xdsl_pipeline = XDSL_MPI_PIPELINE(decomp) elif is_gpu: xdsl_pipeline = XDSL_GPU_PIPELINE - mlir_pipeline = MLIR_GPU_PIPELINE + mlir_pipeline = MLIR_GPU_PIPELINE(block_sizes) # allow jit backdooring to provide your own xdsl code backdoor = os.getenv('XDSL_JIT_BACKDOOR')