diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7e46716cb3..67f50d2f31 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -3,8 +3,6 @@ # Please see the documentation for all configuration options: # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates -# For Devito-xdsl fork - version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values diff --git a/devito/core/__init__.py b/devito/core/__init__.py index 92b4a1c025..2fae24864f 100644 --- a/devito/core/__init__.py +++ b/devito/core/__init__.py @@ -4,7 +4,6 @@ Cpu64FsgCOperator, Cpu64FsgOmpOperator, Cpu64CustomOperator) -from devito.core.cpu_xdsl import XdslnoopOperator, XdslAdvOperator from devito.core.intel import (Intel64AdvCOperator, Intel64AdvOmpOperator, Intel64FsgCOperator, Intel64FsgOmpOperator) from devito.core.arm import ArmAdvCOperator, ArmAdvOmpOperator @@ -12,10 +11,13 @@ from devito.core.gpu import (DeviceNoopOmpOperator, DeviceNoopAccOperator, DeviceAdvOmpOperator, DeviceAdvAccOperator, DeviceFsgOmpOperator, DeviceFsgAccOperator, - DeviceCustomOmpOperator, DeviceCustomAccOperator, - XdslAdvDeviceOperator) + DeviceCustomOmpOperator, DeviceCustomAccOperator) from devito.operator.registry import operator_registry +# Import XDSL Operators +from devito.xdsl_core.xdsl_cpu import XdslnoopOperator, XdslAdvOperator +from devito.xdsl_core.xdsl_gpu import XdslAdvDeviceOperator + # Register CPU Operators operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'C') operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'openmp') diff --git a/devito/core/cpu.py b/devito/core/cpu.py index ecf6dce42f..bca6e3e13d 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -4,13 +4,12 @@ from devito.exceptions import InvalidOperator from devito.passes.equations import collect_derivatives from devito.passes.clusters import (Lift, blocking, buffering, cire, cse, - factorize, fission, fuse, optimize_pows, - optimize_hyperplanes) -from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, linearize, mpiize, - hoist_prodders, relax_incr_dimensions) + factorize, fission, fuse, optimize_hyperplanes, + optimize_pows) +from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, hoist_prodders, + linearize, mpiize, relax_incr_dimensions) from devito.tools import timed_pass - __all__ = ['Cpu64NoopCOperator', 'Cpu64NoopOmpOperator', 'Cpu64AdvCOperator', 'Cpu64AdvOmpOperator', 'Cpu64FsgCOperator', 'Cpu64FsgOmpOperator', 'Cpu64CustomOperator'] diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 1570d1ac97..6742aef69f 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -1,17 +1,9 @@ -from contextlib import redirect_stdout -import io -import os -import sys from functools import partial -from io import StringIO import numpy as np -from devito.arch.archinfo import get_nvidia_cc -from devito.core.operator import CoreOperator, CustomOperator, ParTile -from devito.core.cpu_xdsl import (XdslAdvOperator, generate_mlir_pipeline, - generate_pipeline) +from devito.core.operator import CoreOperator, CustomOperator, ParTile from devito.exceptions import InvalidOperator from devito.operator.operator import rcompile @@ -22,15 +14,9 @@ optimize_pows) from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, mpiize, hoist_prodders, linearize, pthreadify, relax_incr_dimensions) -from devito.logger import info, perf -from devito.mpi import MPI from devito.tools import as_tuple, timed_pass -from xdsl.printer import 
Printer -from xdsl.xdsl_opt_main import xDSLOptMain - -from devito.ir.ietxdsl.cluster_to_ssa import finalize_module_with_globals __all__ = ['DeviceNoopOperator', 'DeviceAdvOperator', 'DeviceCustomOperator', 'DeviceNoopOmpOperator', 'DeviceAdvOmpOperator', 'DeviceFsgOmpOperator', @@ -376,122 +362,6 @@ def _make_iet_passes_mapper(cls, **kwargs): return mapper -class XdslAdvDeviceOperator(XdslAdvOperator): - - _Target = DeviceOmpTarget - - def _jit_compile(self): - """ - JIT-compile the C code generated by the Operator. - It is ensured that JIT compilation will only be performed - once per Operator, reagardless of how many times this method - is invoked. - """ - with self._profiler.timer_on('jit-compile'): - is_mpi = MPI.Is_initialized() - is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX' - - if is_mpi and is_gpu: - raise RuntimeError("Cannot run MPI+GPU for now!") - - # specialize the code for the specific apply parameters - finalize_module_with_globals(self._module, self._jit_kernel_constants, - gpu_boilerplate=is_gpu) - - # print module as IR - module_str = StringIO() - Printer(stream=module_str).print(self._module) - module_str = module_str.getvalue() - - xdsl_pipeline = generate_XDSL_GPU_PIPELINE() - # Get GPU blocking shapes - block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])] # noqa - block_sizes = ','.join(str(bs) for bs in block_sizes) - mlir_pipeline = generate_MLIR_GPU_PIPELINE(block_sizes) - - # allow jit backdooring to provide your own xdsl code - backdoor = os.getenv('XDSL_JIT_BACKDOOR') - if backdoor is not None: - if os.path.splitext(backdoor)[1] == ".so": - info(f"JIT Backdoor: skipping compilation and using {backdoor}") - self._tf.name = backdoor - return - print("JIT Backdoor: loading xdsl file from: " + backdoor) - with open(backdoor, 'r') as f: - module_str = f.read() - - # Uncomment to print the module_str - # Printer().print(module_str) - source_name = os.path.splitext(self._tf.name)[0] + ".mlir" - source_file = open(source_name, "w") - source_file.write(module_str) - source_file.close() - - # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang - cflags = "-O3 -march=native -mtune=native -lmlir_c_runner_utils" - - try: - cc = "clang" - - cflags += " -lmlir_cuda_runtime " - cflags += " -shared " - - # TODO More detailed error handling manually, - # instead of relying on a bash-only feature. 
- - # xdsl-opt, get xDSL IR - # TODO: Remove quotes in pipeline; currently workaround with [1:-1] - # Run the first pipeline, mostly xDSL-centric - xdsl_args = [source_name, - "--allow-unregistered-dialect", - "-p", - xdsl_pipeline[1:-1],] - # We use the Python API to run xDSL rather than a subprocess - # This avoids reimport overhead - xdsl = xDSLOptMain(args=xdsl_args) - out = io.StringIO() - perf("-----------------") - perf(f"xdsl-opt {' '.join(xdsl_args)}") - with redirect_stdout(out): - xdsl.run() - - # To use as input in the next stage - out.seek(0) - # Run the second pipeline, mostly MLIR-centric - xdsl_mlir_args = ["--allow-unregistered-dialect", - "-p", - mlir_pipeline] - # We drive it though xDSL rather than a mlir-opt call for: - # - ability to use xDSL replacement passes in the middle - # - Avoiding complex process cmanagement code here: xDSL provides - xdsl = xDSLOptMain(args=xdsl_mlir_args) - out2 = io.StringIO() - perf("-----------------") - perf(f"xdsl-opt {' '.join(xdsl_mlir_args)}") - with redirect_stdout(out2): - old_stdin = sys.stdin - sys.stdin = out - xdsl.run() - sys.stdin = old_stdin - - # mlir-translate to translate to LLVM-IR - mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir' - out = self.compile(mlir_translate_cmd, out2.getvalue()) - - # Compile with clang and get LLVM-IR - clang_cmd = f'{cc} {cflags} -o {self._tf.name} {self._interop_tf.name} -xir -' # noqa - out = self.compile(clang_cmd, out) - - except Exception as ex: - print("error") - raise ex - - elapsed = self._profiler.py_timers['jit-compile'] - - perf("XDSLAdvDeviceOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" % - (self.name, source_name, elapsed)) - - # OpenACC class DeviceAccOperatorMixin(object): @@ -569,57 +439,3 @@ def reads_if_on_host(c): return set() return runs_on_host, reads_if_on_host - - -def generate_XDSL_GPU_PIPELINE(): - passes = [ - "stencil-shape-inference", - "convert-stencil-to-ll-mlir", - "reconcile-unrealized-casts", - "printf-to-llvm", - "canonicalize" - ] - - return generate_pipeline(passes) - - -# gpu-launch-sink-index-computations seemed to have no impact -def generate_MLIR_GPU_PIPELINE(block_sizes): - return generate_pipeline([ - generate_mlir_pipeline([ - "test-math-algebraic-simplification", - f"scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}}", - ]), - "gpu-map-parallel-loops", - generate_mlir_pipeline([ - "convert-parallel-loops-to-gpu", - "lower-affine", - "canonicalize", - "cse", - "fold-memref-alias-ops", - "gpu-launch-sink-index-computations", - "gpu-kernel-outlining", - "canonicalize{region-simplify}", - "cse", - "fold-memref-alias-ops", - "expand-strided-metadata", - "lower-affine", - "canonicalize", - "cse", - "func.func(gpu-async-region)", - "canonicalize", - "cse", - "convert-arith-to-llvm{index-bitwidth=64}", - "convert-scf-to-cf", - "convert-cf-to-llvm{index-bitwidth=64}", - "canonicalize", - "cse", - "convert-func-to-llvm{use-bare-ptr-memref-call-conv}", - f"nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}}", - "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)", - "gpu-to-llvm", - "gpu-module-to-binary", - "canonicalize", - "cse" - ]), - ])[1:-1] diff --git a/devito/ir/ietxdsl/__init__.py b/devito/ir/ietxdsl/__init__.py deleted file mode 100644 index dac6c2b1a8..0000000000 --- a/devito/ir/ietxdsl/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from devito.ir.ietxdsl.cluster_to_ssa import finalize_module_with_globals - -from devito.ir.ietxdsl.profiling import apply_timers - -# flake8: noqa diff --git 
a/devito/ir/xdsl_iet/__init__.py b/devito/ir/xdsl_iet/__init__.py new file mode 100644 index 0000000000..d7005bb6d1 --- /dev/null +++ b/devito/ir/xdsl_iet/__init__.py @@ -0,0 +1,5 @@ +from devito.ir.xdsl_iet.cluster_to_ssa import finalize_module_with_globals + +from devito.ir.xdsl_iet.profiling import apply_timers + +# flake8: noqa diff --git a/devito/ir/ietxdsl/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py similarity index 84% rename from devito/ir/ietxdsl/cluster_to_ssa.py rename to devito/ir/xdsl_iet/cluster_to_ssa.py index c8c1cbe26b..a28b13dd85 100644 --- a/devito/ir/ietxdsl/cluster_to_ssa.py +++ b/devito/ir/xdsl_iet/cluster_to_ssa.py @@ -1,14 +1,14 @@ from functools import reduce -import numpy as np # ------------- General imports -------------# from typing import Any, Iterable from dataclasses import dataclass, field -from sympy import Add, And, Expr, Float, GreaterThan, Indexed, Integer, LessThan, Mod, Mul, Number, Pow, StrictGreaterThan, StrictLessThan, Symbol, floor +from sympy import (Add, And, Expr, Float, GreaterThan, Indexed, Integer, LessThan, + Number, Pow, StrictGreaterThan, StrictLessThan, Symbol, floor, + Mul) from sympy.core.relational import Relational from sympy.logic.boolalg import BooleanFunction -from devito.ir.equations.equation import OpInc from devito.operations.interpolators import Injection from devito.operator.operator import Operator from devito.symbolics.search import retrieve_dimensions, retrieve_functions @@ -21,8 +21,9 @@ from devito.types.equation import Eq # ------------- xdsl imports -------------# -from xdsl.dialects import (arith, builtin, func, memref, scf, - stencil, gpu) +from xdsl.dialects import arith, func, memref, scf, stencil, gpu, builtin +from xdsl.dialects.builtin import (ModuleOp, UnrealizedConversionCastOp, StringAttr, + IndexType) from xdsl.dialects.experimental import math from xdsl.ir import Block, Operation, OpResult, Region, SSAValue from xdsl.pattern_rewriter import ( @@ -30,8 +31,7 @@ PatternRewriter, PatternRewriteWalker, RewritePattern, - op_type_rewrite_pattern, - InsertPoint + op_type_rewrite_pattern ) from xdsl.builder import ImplicitBuilder from xdsl.transforms.experimental.convert_stencil_to_ll_mlir import StencilToMemRefType @@ -40,21 +40,13 @@ from devito import Grid, SteppingDimension from devito.ir.equations import LoweredEq from devito.symbolics import retrieve_function_carriers -from devito.tools.data_structures import OrderedSet -from devito.types.dense import DiscreteFunction, Function, TimeFunction -from devito.types.equation import Eq -from devito.types.mlir_types import dtype_to_xdsltype +from devito.types.mlir_types import dtype_to_xdsltype, ptr_of, f32 # ------------- devito-xdsl SSA imports -------------# -from devito.ir.ietxdsl import iet_ssa -from devito.ir.ietxdsl.utils import is_int, is_float, dtypes_to_xdsltypes -from devito.types.mlir_types import f32, ptr_of - - -from examples.seismic.source import PointSource -from tests.test_interpolation import points -from tests.test_timestepping import d +from devito.ir.xdsl_iet import iet_ssa +from devito.ir.xdsl_iet.utils import is_int, is_float, dtypes_to_xdsltypes +from examples.seismic import PointSource # flake8: noqa @@ -91,6 +83,7 @@ def setup_memref_args(functions): return args + class ExtractDevitoStencilConversion: """ Lower Devito equations to the stencil dialect @@ -103,12 +96,6 @@ class ExtractDevitoStencilConversion: symbol_values: dict[str, SSAValue] time_offs: int - def __init__(self): - self.temps = dict() - self.symbol_values = dict() 
- - time_offs: int - def __init__(self, operator: type[Operator]): self.temps = dict() self.operator = operator @@ -144,13 +131,6 @@ def convert_symbol_eq(self, symbol: Symbol, rhs: LoweredEq, **kwargs): self.symbol_values[symbol.name] = self._visit_math_nodes(None, rhs, None) self.symbol_values[symbol.name].name_hint = symbol.name - def convert_symbol_eq(self, symbol: Symbol, rhs: LoweredEq, **kwargs): - """ - Convert a symbol equation to xDSL. - """ - self.symbol_values[symbol.name] = self._visit_math_nodes(None, rhs, None) - self.symbol_values[symbol.name].name_hint = symbol.name - def _convert_eq(self, eq: LoweredEq, **kwargs): """ # Docs here Need rewriting @@ -215,25 +195,27 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr, # If we are in a stencil (encoded by having the output_indexed passed), we # compute the relative space offsets and make it a stencil offset if output_indexed is not None: - space_offsets = [node.indices[d] - output_indexed.indices[d] for d in node.function.space_dimensions] + space_offsets = ([node.indices[d] - output_indexed.indices[d] + for d in node.function.space_dimensions]) temp = self.function_values[(node.function, time_offset)] access = stencil.AccessOp.get(temp, space_offsets) return access.res # Otherwise, generate a load op else: temp = self.function_values[(node.function, time_offset)] - memtemp = builtin.UnrealizedConversionCastOp.get(temp, StencilToMemRefType(temp.type)).results[0] + memreftype = StencilToMemRefType(temp.type) + memtemp = UnrealizedConversionCastOp.get(temp, memreftype).results[0] memtemp.name_hint = temp.name_hint + "_mem" indices = node.indices if isinstance(node.function, TimeFunction): indices = indices[1:] - ssa_indices = [self._visit_math_nodes(dim, i, output_indexed) for i in node.indices] + ssa_indices = ([self._visit_math_nodes(dim, i, output_indexed) + for i in node.indices]) for i, ssa_i in enumerate(ssa_indices): if isinstance(ssa_i.type, builtin.IntegerType): - ssa_indices[i] = arith.IndexCastOp(ssa_i, builtin.IndexType()) + ssa_indices[i] = arith.IndexCastOp(ssa_i, IndexType()) return memref.Load.get(memtemp, ssa_indices).res - import pdb; pdb.set_trace() # Handle Integers elif isinstance(node, Integer): cst = arith.Constant.from_int_and_width(int(node), builtin.i64) @@ -294,13 +276,16 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr, # Handle Mod elif isinstance(node, INT): assert len(node.args) == 1, "Expected single argument for integer cast." - return arith.FPToSIOp(self._visit_math_nodes(dim, node.args[0], output_indexed), builtin.i64).result + return arith.FPToSIOp(self._visit_math_nodes(dim, node.args[0], + output_indexed), builtin.i64).result elif isinstance(node, floor): assert len(node.args) == 1, "Expected single argument for floor." 
- return math.FloorOp(self._visit_math_nodes(dim, node.args[0], output_indexed)).result + op = self._visit_math_nodes(dim, node.args[0], output_indexed) + return math.FloorOp(op).result elif isinstance(node, And): - SSAargs = (self._visit_math_nodes(dim, arg, output_indexed) for arg in node.args) - return reduce(lambda x,y : arith.AndI(x,y).result, SSAargs) + SSAargs = (self._visit_math_nodes(dim, arg, output_indexed) + for arg in node.args) + return reduce(lambda x, y : arith.AndI(x, y).result, SSAargs) elif isinstance(node, Relational): if isinstance(node, GreaterThan): mnemonic = "sge" @@ -325,7 +310,7 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr, else: raise NotImplementedError(f"Unknown math:{type(node)} {node}", node) - def build_stencil_step(self, dim: SteppingDimension, eq:LoweredEq) -> None: + def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None: """ Builds the body of the step function for a given dimension and equation. @@ -345,7 +330,6 @@ def build_stencil_step(self, dim: SteppingDimension, eq:LoweredEq) -> None: elif isinstance(f.function, Function): time_offset = 0 else: - import pdb;pdb.set_trace() raise NotImplementedError(f"reading function of type {type(f.function)} not supported") read_functions.add((f.function, time_offset)) @@ -375,7 +359,7 @@ def build_stencil_step(self, dim: SteppingDimension, eq:LoweredEq) -> None: assert "temp" in apply_op.name_hint apply_arg.name_hint = apply_op.name_hint.replace("temp", "blk") - self.apply_temps = {k:v for k,v in zip(read_functions, apply.region.block.args)} + self.apply_temps = {k: v for k, v in zip(read_functions, apply.region.block.args)} # Update the function values with the new temps self.function_values |= self.apply_temps @@ -400,7 +384,7 @@ def build_generic_step_expression(self, dim: SteppingDimension, eq: LoweredEq): # Sources value = self._visit_math_nodes(dim, eq.rhs, None) temp = self.function_values[self.out_time_buffer] - memtemp = builtin.UnrealizedConversionCastOp.get([temp], [StencilToMemRefType(temp.type)]).results[0] + memtemp = UnrealizedConversionCastOp.get([temp], [StencilToMemRefType(temp.type)]).results[0] memtemp.name_hint = temp.name_hint + "_mem" indices = eq.lhs.indices if isinstance(eq.lhs.function, TimeFunction): @@ -408,13 +392,17 @@ def build_generic_step_expression(self, dim: SteppingDimension, eq: LoweredEq): ssa_indices = [self._visit_math_nodes(dim, i, None) for i in indices] for i, ssa_i in enumerate(ssa_indices): if isinstance(ssa_i.type, builtin.IntegerType): - ssa_indices[i] = arith.IndexCastOp(ssa_i, builtin.IndexType()) + ssa_indices[i] = arith.IndexCastOp(ssa_i, IndexType()) match eq.operation: case None: memref.Store.get(value, memtemp, ssa_indices) - case OpInc: - memref.AtomicRMWOp(operands=[value, memtemp, ssa_indices], result_types=[value.type], properties={"kind" : builtin.IntegerAttr(0, builtin.i64)}) + case OpInc: # noqa + # Maybe rename + attr = builtin.IntegerAttr(0, builtin.i64) + memref.AtomicRMWOp(operands=[value, memtemp, ssa_indices], + result_types=[value.type], + properties={"kind": attr}) def build_condition(self, dim: SteppingDimension, eq: BooleanFunction): return self._visit_math_nodes(dim, eq, None) @@ -436,12 +424,12 @@ def build_time_loop( ): # Bounds and step boilerpalte lb = iet_ssa.LoadSymbolic.get( - step_dim.symbolic_min._C_name, builtin.IndexType() + step_dim.symbolic_min._C_name, IndexType() ) ub = iet_ssa.LoadSymbolic.get( - step_dim.symbolic_max._C_name, builtin.IndexType() + step_dim.symbolic_max._C_name, 
IndexType()
         )
-        one = arith.Constant.from_int_and_width(1, builtin.IndexType())
+        one = arith.Constant.from_int_and_width(1, IndexType())
         # Devito iterates from time_m to time_M *inclusive*, MLIR only takes
         # exclusive upper bounds, so we increment here.
         ub = arith.Addi(ub, one)
@@ -449,7 +437,7 @@
         # Take the exact time_step from Devito
         try:
             step = arith.Constant.from_int_and_width(
-                int(step_dim.symbolic_incr), builtin.IndexType()
+                int(step_dim.symbolic_incr), IndexType()
             )
 
             step.result.name_hint = "step"
@@ -469,7 +457,7 @@
             ub,
             step,
             iter_args,
-            Block(arg_types=[builtin.IndexType(), *(a.type for a in iter_args)]),
+            Block(arg_types=[IndexType(), *(a.type for a in iter_args)]),
         )
 
         # Name the 'time' step iterator
@@ -526,30 +514,45 @@ def _lower_injection(self, eqs: list[LoweredEq]):
         for interval in ispace[1:]:
             lower = interval.symbolic_min
             if isinstance(lower, Scalar):
-                lb = iet_ssa.LoadSymbolic.get(lower._C_name, builtin.IndexType())
+                lb = iet_ssa.LoadSymbolic.get(lower._C_name, IndexType())
             elif isinstance(lower, (Number, int)):
-                lb = arith.Constant.from_int_and_width(int(lower), builtin.IndexType())
+                lb = arith.Constant.from_int_and_width(int(lower), IndexType())
             else:
                 raise NotImplementedError(f"Lower bound of type {type(lower)} not supported")
-            lb.result.name_hint = f"{interval.dim.name}_m"
+
+            try:
+                name = interval.dim.symbolic_min.name
+            except AttributeError:
+                assert interval.dim.symbolic_min.is_integer
+                name = f"{interval.dim.name}_m"
+
+            lb.result.name_hint = name
 
             upper = interval.symbolic_max
             if isinstance(upper, Scalar):
-                ub = iet_ssa.LoadSymbolic.get(upper._C_name, builtin.IndexType())
+                ub = iet_ssa.LoadSymbolic.get(upper._C_name, IndexType())
             elif isinstance(upper, (Number, int)):
-                ub = arith.Constant.from_int_and_width(int(upper), builtin.IndexType())
+                ub = arith.Constant.from_int_and_width(int(upper), IndexType())
             else:
                 raise NotImplementedError(
                     f"Upper bound of type {type(upper)} not supported"
                 )
-            ub.result.name_hint = f"{interval.dim.name}_M"
+
+            try:
+                name = interval.dim.symbolic_max.name
+            except AttributeError:
+                assert interval.dim.symbolic_max.is_integer
+                name = f"{interval.dim.name}_M"
+
+            ub.result.name_hint = name
+
             lbs.append(lb)
             ubs.append(ub)
 
-        steps = [arith.Constant.from_int_and_width(1, builtin.IndexType()).result]*len(ubs)
+        steps = [arith.Constant.from_int_and_width(1, IndexType()).result]*len(ubs)
         ubs = [arith.Addi(ub, steps[0]) for ub in ubs]
 
-        with ImplicitBuilder(scf.ParallelOp(lbs, ubs, steps, [pblock := Block(arg_types=[builtin.IndexType()]*len(ubs))]).body):
+        with ImplicitBuilder(scf.ParallelOp(lbs, ubs, steps, [pblock := Block(arg_types=[IndexType()]*len(ubs))]).body):
             for arg, interval in zip(pblock.args, ispace[1:], strict=True):
                 arg.name_hint = interval.dim.name
                 self.symbol_values[interval.dim.name] = arg
@@ -558,7 +561,7 @@
             scf.Yield()
         # raise NotImplementedError("Injections not supported yet")
 
-    def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp:
+    def convert(self, eqs: Iterable[Eq], **kwargs) -> ModuleOp:
         """
         This converts a Devito Operator, represented here by a list of
         LoweredEqs, to an xDSL module defining a function implementing it.
@@ -575,7 +578,8 @@
         their time sizes. Their sizes are deduced from the Grid.
         2. Create a time iteration loop, swapping buffers to implement time
         buffering.
-        NB: This needs to be converted to a Cluster conversion soon, which will be more sound.
+ NB: This needs to be converted to a Cluster conversion soon, + which will be more sound. ```mlir func.func @apply_kernel(%u_vec_0 : !stencil.field<[-1,4]xf32>, %u_vec_1 : !stencil.field<[-1,4]xf32>) { @@ -598,10 +602,12 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: Those represents runtime values not yet known that will be JIT-compiled when calling the operator. """ + # Instantiate the module. - self.function_values : dict[tuple[Function, int], SSAValue] = {} - self.symbol_values : dict[str, SSAValue] = {} - module = builtin.ModuleOp(Region([block := Block([])])) + self.function_values: dict[tuple[Function, int], SSAValue] = {} + self.symbol_values: dict[str, SSAValue] = {} + + module = ModuleOp(Region([block := Block([])])) with ImplicitBuilder(block): # Get all functions used in the equations functions = OrderedSet() @@ -624,8 +630,8 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: else: raise NotImplementedError(f"Expression {eq} of type {type(eq)} not supported") - self.time_buffers : list[TimeFunction] = [] - self.functions : list[Function] = [] + self.time_buffers: list[TimeFunction] = [] + self.functions: list[Function] = [] for f in functions: match f: case TimeFunction(): @@ -667,12 +673,13 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: with ImplicitBuilder(xdsl_func.body.block): # Get the stepping dimension, if there is any in the whole input - time_functions = [f for (f,_) in self.time_buffers] + time_functions = [f for (f, _) in self.time_buffers] dimensions = { d for f in (self.functions + time_functions) for d in f.dimensions } - step_dim = next((d for d in dimensions if isinstance(d, SteppingDimension)), None) + step_dim = next((d for d in dimensions if + isinstance(d, SteppingDimension)), None) if step_dim is not None: self.build_time_loop(eqs, step_dim, **kwargs) else: @@ -686,7 +693,7 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: def _ensure_same_type(self, *vals: SSAValue): if all(isinstance(val.type, builtin.IntegerAttr) for val in vals): return vals - if all(isinstance(val.type, builtin.IndexType) for val in vals): + if all(isinstance(val.type, IndexType) for val in vals): # Sources return vals if all(is_float(val) for val in vals): @@ -700,7 +707,7 @@ def _ensure_same_type(self, *vals: SSAValue): if cast_to_floats and is_float(val): processed.append(val) continue - if (not cast_to_floats) and isinstance(val.type, builtin.IndexType): + if (not cast_to_floats) and isinstance(val.type, IndexType): processed.append(val) continue # if the val is the result of a arith.constant with no uses, @@ -716,17 +723,17 @@ def _ensure_same_type(self, *vals: SSAValue): float(val.op.value.value.data), builtin.f32 ) else: - val.type = builtin.IndexType() - val.op.value.type = builtin.IndexType() + val.type = IndexType() + val.op.value.type = IndexType() processed.append(val) continue # insert a cast op if cast_to_floats: - if val.type == builtin.IndexType(): + if val.type == IndexType(): val = arith.IndexCastOp(val, builtin.i64).result conv = arith.SIToFPOp(val, builtin.f32) else: - conv = arith.IndexCastOp(val, builtin.IndexType()) + conv = arith.IndexCastOp(val, IndexType()) processed.append(conv.result) return processed @@ -754,7 +761,7 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): return self.done = True - op.sym_name = builtin.StringAttr("gpu_kernel") + op.sym_name = StringAttr("gpu_kernel") print("Doing GPU STUFF") # GPU STUFF wrapper = 
func.FuncOp(self.func_name, op.function_type, Region(Block([func.Return()], arg_types=op.function_type.inputs))) @@ -762,12 +769,12 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): wrapper.body.block.insert_op_before(func.Call("gpu_kernel", body.args, []), body.last_op) for arg in wrapper.args: shapetype = arg.type - if isinstance(shapetype, stencil.FieldType): + if isinstance(shapetype, stencil.FieldType): memref_type = memref.MemRefType.from_element_type_and_shape(shapetype.get_element_type(), shapetype.get_shape()) alloc = gpu.AllocOp(memref.MemRefType.from_element_type_and_shape(shapetype.get_element_type(), shapetype.get_shape())) - outcast = builtin.UnrealizedConversionCastOp.get(alloc, shapetype) + outcast = UnrealizedConversionCastOp.get(alloc, shapetype) arg.replace_by(outcast.results[0]) - incast = builtin.UnrealizedConversionCastOp.get(arg, memref_type) + incast = UnrealizedConversionCastOp.get(arg, memref_type) copy = gpu.MemcpyOp(source=incast, destination=alloc) body.insert_ops_before([alloc, outcast, incast, copy], body.ops.first) @@ -777,49 +784,6 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): rewriter.insert_op_after_matched_op(wrapper) -class TimerRewritePattern(RewritePattern): - """ - Base class for time benchmarking related rewrite patterns - """ - pass - - -@dataclass -class MakeFunctionTimed(TimerRewritePattern): - """ - Populate the section0 devito timer with the total runtime of the function - """ - func_name: str - seen_ops: set[func.Func] = field(default_factory=set) - - @op_type_rewrite_pattern - def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): - if op.sym_name.data != self.func_name or op in self.seen_ops: - return - - # only apply once - self.seen_ops.add(op) - - # Insert timer start and end calls - rewriter.insert_op([ - t0 := func.Call('timer_start', [], [builtin.f64]) - ], InsertPoint.at_start(op.body.block)) - - ret = op.get_return_op() - assert ret is not None - - rewriter.insert_op_before([ - timers := iet_ssa.LoadSymbolic.get('timers', llvm.LLVMPointerType.opaque()), - t1 := func.Call('timer_end', [t0], [builtin.f64]), - llvm.StoreOp(t1, timers), - ], ret) - - rewriter.insert_op([ - func.FuncOp.external('timer_start', [], [builtin.f64]), - func.FuncOp.external('timer_end', [builtin.f64], [builtin.f64]), - ], InsertPoint.after(rewriter.current_operation)) - - def get_containing_func(op: Operation) -> func.FuncOp | None: while op is not None and not isinstance(op, func.FuncOp): op = op.parent_op() @@ -887,7 +851,7 @@ def match_and_rewrite(self, op: iet_ssa.LoadSymbolic, rewriter: PatternRewriter, parent.update_function_type() -def finalize_module_with_globals(module: builtin.ModuleOp, known_symbols: dict[str, Any], +def finalize_module_with_globals(module: ModuleOp, known_symbols: dict[str, Any], gpu_boilerplate): """ This function finalizes a module by replacing all symbolic constants with their diff --git a/devito/ir/ietxdsl/iet_ssa.py b/devito/ir/xdsl_iet/iet_ssa.py similarity index 100% rename from devito/ir/ietxdsl/iet_ssa.py rename to devito/ir/xdsl_iet/iet_ssa.py diff --git a/devito/ir/ietxdsl/profiling.py b/devito/ir/xdsl_iet/profiling.py similarity index 98% rename from devito/ir/ietxdsl/profiling.py rename to devito/ir/xdsl_iet/profiling.py index f767a34ff1..1b434d78e0 100644 --- a/devito/ir/ietxdsl/profiling.py +++ b/devito/ir/xdsl_iet/profiling.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field -from devito.ir.ietxdsl import iet_ssa +from 
devito.ir.xdsl_iet import iet_ssa
 
 from xdsl.dialects import builtin, func, llvm
 from xdsl.pattern_rewriter import (RewritePattern, op_type_rewrite_pattern,
diff --git a/devito/ir/ietxdsl/utils.py b/devito/ir/xdsl_iet/utils.py
similarity index 100%
rename from devito/ir/ietxdsl/utils.py
rename to devito/ir/xdsl_iet/utils.py
diff --git a/devito/operator/operator.py b/devito/operator/operator.py
index 1c828a88a0..0c66f74ed8 100644
--- a/devito/operator/operator.py
+++ b/devito/operator/operator.py
@@ -271,6 +271,7 @@ def _lower(cls, expressions, **kwargs):
     def _rcompile_wrapper(cls, **kwargs0):
         def wrapper(expressions, **kwargs1):
             return rcompile(expressions, {**kwargs0, **kwargs1})
+
         return wrapper
 
     @classmethod
@@ -837,7 +838,6 @@ def apply(self, **kwargs):
         # Invoke kernel function with args
         arg_values = [args[p.name] for p in self.parameters]
-
         try:
             cfunction = self.cfunction
             with self._profiler.timer_on('apply', comm=args.comm):
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 5f66ed1c99..102bfe3a75 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -1366,7 +1366,6 @@ def free_symbols(self):
 
 
 class IndexedData(IndexedBase):
-    pass
 
 
diff --git a/devito/xdsl_core/__init__.py b/devito/xdsl_core/__init__.py
new file mode 100644
index 0000000000..3f0f0bffc8
--- /dev/null
+++ b/devito/xdsl_core/__init__.py
@@ -0,0 +1,4 @@
+from .xdsl_cpu import *
+from .xdsl_gpu import *
+
+# flake8: noqa
\ No newline at end of file
diff --git a/devito/xdsl_core/utils.py b/devito/xdsl_core/utils.py
new file mode 100644
index 0000000000..218e4df827
--- /dev/null
+++ b/devito/xdsl_core/utils.py
@@ -0,0 +1,13 @@
+from typing import Iterable
+
+
+def generate_pipeline(passes: Iterable[str]):
+    """Generate a pipeline string from a list of passes."""
+    passes_string = ",".join(passes)
+    return f'"{passes_string}"'
+
+
+def generate_mlir_pipeline(passes: Iterable[str]):
+    """Wrap a list of MLIR passes for delegation to mlir-opt."""
+    passes_string = ",".join(passes)
+    return f'mlir-opt[{passes_string}]'
diff --git a/devito/core/cpu_xdsl.py b/devito/xdsl_core/xdsl_cpu.py
similarity index 97%
rename from devito/core/cpu_xdsl.py
rename to devito/xdsl_core/xdsl_cpu.py
index 6c4414a63a..24e5d38ce5 100644
--- a/devito/core/cpu_xdsl.py
+++ b/devito/xdsl_core/xdsl_cpu.py
@@ -9,8 +9,6 @@
 
 from io import StringIO
 
-from typing import Iterable
-
 from devito.core.operator import CoreOperator
 from devito.ir.iet import Callable, MetaCall
 from devito.ir.iet.nodes import Section
@@ -23,13 +21,14 @@
 from xdsl.printer import Printer
 from xdsl.xdsl_opt_main import xDSLOptMain
 
-from devito.ir.ietxdsl.cluster_to_ssa import (ExtractDevitoStencilConversion,
-                                              finalize_module_with_globals,
-                                              setup_memref_args)  # noqa
+from devito.ir.xdsl_iet.cluster_to_ssa import (ExtractDevitoStencilConversion,
+                                               finalize_module_with_globals,
+                                               setup_memref_args)  # noqa
 
-from devito.ir.ietxdsl.profiling import apply_timers
+from devito.ir.xdsl_iet.profiling import apply_timers
 from devito.passes.iet import CTarget, OmpTarget
 from devito.core.cpu import Cpu64OperatorMixin
+from devito.xdsl_core.utils import generate_pipeline, generate_mlir_pipeline
 
 
 __all__ = ['XdslnoopOperator', 'XdslAdvOperator']
@@ -638,17 +637,6 @@ def generate_XDSL_MPI_PIPELINE(decomp, nb_tiled_dims):
     return generate_pipeline(passes)
 
 
-def generate_pipeline(passes: Iterable[str]):
-    'Generate a pipeline string from a list of passes'
-    passes_string = ",".join(passes)
-    return f'"{passes_string}"'
-
-
-def generate_mlir_pipeline(passes: Iterable[str]):
-    passes_string = ",".join(passes)
-    return f'mlir-opt[{passes_string}]'
-
-
 # small interop shim script for stuff that we don't want to implement in mlir-ir
 _INTEROP_C = """
 #include
diff --git a/devito/xdsl_core/xdsl_gpu.py b/devito/xdsl_core/xdsl_gpu.py
new file mode 100644
index 0000000000..87a1bc9b8a
--- /dev/null
+++ b/devito/xdsl_core/xdsl_gpu.py
@@ -0,0 +1,192 @@
+from contextlib import redirect_stdout
+import io
+import os
+import sys
+from io import StringIO
+
+from devito.arch.archinfo import get_nvidia_cc
+
+from devito.xdsl_core.xdsl_cpu import XdslAdvOperator
+
+from devito.ir.xdsl_iet.cluster_to_ssa import finalize_module_with_globals
+from devito.mpi import MPI
+
+from devito.logger import info, perf
+
+from xdsl.printer import Printer
+from xdsl.xdsl_opt_main import xDSLOptMain
+from devito.passes.iet import DeviceOmpTarget
+from devito.xdsl_core.utils import generate_pipeline, generate_mlir_pipeline
+
+
+__all__ = ['XdslAdvDeviceOperator']
+
+
+class XdslAdvDeviceOperator(XdslAdvOperator):
+
+    _Target = DeviceOmpTarget
+
+    def _jit_compile(self):
+        """
+        JIT-compile the C code generated by the Operator.
+        It is ensured that JIT compilation will only be performed
+        once per Operator, regardless of how many times this method
+        is invoked.
+        """
+        with self._profiler.timer_on('jit-compile'):
+            is_mpi = MPI.Is_initialized()
+            is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX'
+
+            if is_mpi and is_gpu:
+                raise RuntimeError("Cannot run MPI+GPU for now!")
+
+            # specialize the code for the specific apply parameters
+            finalize_module_with_globals(self._module, self._jit_kernel_constants,
+                                         gpu_boilerplate=is_gpu)
+
+            # print module as IR
+            module_str = StringIO()
+            Printer(stream=module_str).print(self._module)
+            module_str = module_str.getvalue()
+
+            xdsl_pipeline = generate_XDSL_GPU_PIPELINE()
+            # Get GPU blocking shapes
+            block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])]  # noqa
+            block_sizes = ','.join(str(bs) for bs in block_sizes)
+            mlir_pipeline = generate_MLIR_GPU_PIPELINE(block_sizes)
+
+            # allow jit backdooring to provide your own xdsl code
+            backdoor = os.getenv('XDSL_JIT_BACKDOOR')
+            if backdoor is not None:
+                if os.path.splitext(backdoor)[1] == ".so":
+                    info(f"JIT Backdoor: skipping compilation and using {backdoor}")
+                    self._tf.name = backdoor
+                    return
+                print("JIT Backdoor: loading xdsl file from: " + backdoor)
+                with open(backdoor, 'r') as f:
+                    module_str = f.read()
+
+            # Uncomment to print the module_str
+            # Printer().print(module_str)
+            source_name = os.path.splitext(self._tf.name)[0] + ".mlir"
+            source_file = open(source_name, "w")
+            source_file.write(module_str)
+            source_file.close()
+
+            # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang
+            cflags = "-O3 -march=native -mtune=native -lmlir_c_runner_utils"
+
+            try:
+                cc = "clang"
+
+                cflags += " -lmlir_cuda_runtime "
+                cflags += " -shared "
+
+                # TODO More detailed error handling manually,
+                # instead of relying on a bash-only feature.
+
+                # xdsl-opt, get xDSL IR
+                # TODO: Remove quotes in pipeline; currently workaround with [1:-1]
+                # Run the first pipeline, mostly xDSL-centric
+                xdsl_args = [source_name,
+                             "--allow-unregistered-dialect",
+                             "-p",
+                             xdsl_pipeline[1:-1],]
+                # We use the Python API to run xDSL rather than a subprocess
+                # This avoids reimport overhead
+                xdsl = xDSLOptMain(args=xdsl_args)
+                out = io.StringIO()
+                perf("-----------------")
+                perf(f"xdsl-opt {' '.join(xdsl_args)}")
+                with redirect_stdout(out):
+                    xdsl.run()
+
+                # To use as input in the next stage
+                out.seek(0)
+                # Run the second pipeline, mostly MLIR-centric
+                xdsl_mlir_args = ["--allow-unregistered-dialect",
+                                  "-p",
+                                  mlir_pipeline]
+                # We drive it through xDSL rather than an mlir-opt call for:
+                # - ability to use xDSL replacement passes in the middle
+                # - Avoiding complex process management code here: xDSL provides it
+                xdsl = xDSLOptMain(args=xdsl_mlir_args)
+                out2 = io.StringIO()
+                perf("-----------------")
+                perf(f"xdsl-opt {' '.join(xdsl_mlir_args)}")
+                with redirect_stdout(out2):
+                    old_stdin = sys.stdin
+                    sys.stdin = out
+                    xdsl.run()
+                    sys.stdin = old_stdin
+
+                # mlir-translate to translate to LLVM-IR
+                mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir'
+                out = self.compile(mlir_translate_cmd, out2.getvalue())
+
+                # Compile the LLVM-IR with clang into a shared object
+                clang_cmd = f'{cc} {cflags} -o {self._tf.name} {self._interop_tf.name} -xir -'  # noqa
+                out = self.compile(clang_cmd, out)
+
+            except Exception as ex:
+                print("error")
+                raise ex
+
+        elapsed = self._profiler.py_timers['jit-compile']
+
+        perf("XDSLAdvDeviceOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" %
+             (self.name, source_name, elapsed))
+
+
+def generate_XDSL_GPU_PIPELINE():
+    passes = [
+        "stencil-shape-inference",
+        "convert-stencil-to-ll-mlir",
+        "reconcile-unrealized-casts",
+        "printf-to-llvm",
+        "canonicalize"
+    ]
+
+    return generate_pipeline(passes)
+
+
+# gpu-launch-sink-index-computations seemed to have no impact
+def generate_MLIR_GPU_PIPELINE(block_sizes):
+    return generate_pipeline([
+        generate_mlir_pipeline([
+            "test-math-algebraic-simplification",
+            f"scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}}",
+        ]),
+        "gpu-map-parallel-loops",
+        generate_mlir_pipeline([
+            "convert-parallel-loops-to-gpu",
+            "lower-affine",
+            "canonicalize",
+            "cse",
+            "fold-memref-alias-ops",
+            "gpu-launch-sink-index-computations",
+            "gpu-kernel-outlining",
+            "canonicalize{region-simplify}",
+            "cse",
+            "fold-memref-alias-ops",
+            "expand-strided-metadata",
+            "lower-affine",
+            "canonicalize",
+            "cse",
+            "func.func(gpu-async-region)",
+            "canonicalize",
+            "cse",
+            "convert-arith-to-llvm{index-bitwidth=64}",
+            "convert-scf-to-cf",
+            "convert-cf-to-llvm{index-bitwidth=64}",
+            "canonicalize",
+            "cse",
+            "convert-func-to-llvm{use-bare-ptr-memref-call-conv}",
+            f"nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}}",
+            "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)",
+            "gpu-to-llvm",
+            "gpu-module-to-binary",
+            "canonicalize",
+            "cse"
+        ]),
+    ])[1:-1]
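
For reference, the pipeline strings built by generate_XDSL_GPU_PIPELINE and generate_MLIR_GPU_PIPELINE compose via the two helpers this patch moves into devito/xdsl_core/utils.py. A minimal, self-contained sketch — helper bodies copied from the patch, the pass names purely illustrative:

from typing import Iterable


def generate_pipeline(passes: Iterable[str]):
    # As in devito/xdsl_core/utils.py: join the passes and wrap in double quotes
    passes_string = ",".join(passes)
    return f'"{passes_string}"'


def generate_mlir_pipeline(passes: Iterable[str]):
    # Wrap a group of MLIR passes so xDSL delegates them to mlir-opt
    passes_string = ",".join(passes)
    return f'mlir-opt[{passes_string}]'


# Compose a nested pipeline, mirroring the shape of generate_MLIR_GPU_PIPELINE
pipeline = generate_pipeline([
    generate_mlir_pipeline(["canonicalize", "cse"]),
    "gpu-map-parallel-loops",
])
print(pipeline)        # "mlir-opt[canonicalize,cse],gpu-map-parallel-loops"
print(pipeline[1:-1])  # mlir-opt[canonicalize,cse],gpu-map-parallel-loops

The [1:-1] slices in _jit_compile and at the end of generate_MLIR_GPU_PIPELINE exist solely to strip those outer double quotes before the string reaches xDSLOptMain's -p argument, which is what the "Remove quotes in pipeline" TODO above refers to.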
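
Likewise, the GPU tile sizes fed to scf-parallel-loop-tiling come from clamping the per-dimension targets (32, 4, 8) by the grid extents known at JIT time. A small sketch of that one-liner from _jit_compile; the sizes dict is a hypothetical stand-in for self._jit_kernel_constants:

# Hypothetical stand-in for the JIT-time kernel constants
sizes = {"x_size": 256, "y_size": 256, "z_size": 2}

# Same expression as in _jit_compile: clamp each target by the grid extent,
# defaulting to 1 for a missing dimension
block_sizes = [min(target, sizes.get(f"{dim}_size", 1))
               for target, dim in zip([32, 4, 8], ["x", "y", "z"])]
print(','.join(str(bs) for bs in block_sizes))  # -> 32,4,2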