bench: cleanup redundant code and add Operator and XDSLOperator at same file #21

Merged: 61 commits, Oct 3, 2023

Commits (61)
687e857
mpi: Add custom topology from devito codebase
georgebisbas Jul 14, 2023
9af33a5
mpi: Add tests for Custom topology
georgebisbas Jul 14, 2023
2d33ad2
devito/mpi/distributed.py
georgebisbas Jul 14, 2023
aa7e25a
Clean benchmark
georgebisbas Jul 14, 2023
d4d0ed3
Cleanup benchmark
georgebisbas Jul 14, 2023
416f1ec
bench: cleanup
georgebisbas Jul 19, 2023
51dbc2d
bench: cleanup 3d
georgebisbas Jul 19, 2023
fe1650c
bench: more cleanup, drop redundant
georgebisbas Jul 19, 2023
479d478
bench: more cleanup
georgebisbas Jul 19, 2023
1142684
add 'set -eo pipefail' to compiler pipeline to catch errors early
AntonLydike Jul 20, 2023
8e5e7ae
Use /bin/bash for set -eo pipefail.
PapyChacal Jul 20, 2023
335b5f5
add todo.
PapyChacal Jul 20, 2023
e9c76a7
Make xDSL flow use a temp .mlir file just like the usual temp .c file.
PapyChacal Jul 20, 2023
5df0f76
operator: Add fixed for xdsloperator compilation - apply_kernel
georgebisbas Jul 21, 2023
6fc0ff7
Lower subviews.
PapyChacal Jul 21, 2023
b346498
Link to MLIR runner utils.
PapyChacal Jul 21, 2023
bd48ffa
c_runner_utils rather.
PapyChacal Jul 21, 2023
d9c4239
Merge branch 'add_custom_topology' into bench_edits
PapyChacal Jul 21, 2023
c389f96
Reverse stencil.apply inputs and try to name accordingly.
PapyChacal Jul 22, 2023
c5f9552
fix data copy, buffer play
georgebisbas Jul 26, 2023
d8b5dba
wave2d.py
georgebisbas Jul 27, 2023
a67394e
Comment out pdb.
PapyChacal Jul 27, 2023
6de1971
Fix initial buffer order.
PapyChacal Jul 27, 2023
afca66d
add canonicalize-dmp pass to dmp pipeline
AntonLydike Jul 31, 2023
f2604c6
Merge pull request #22 from xdslproject/emilien/try-fix-wave
georgebisbas Aug 2, 2023
8087f95
Add tiling.
PapyChacal Aug 1, 2023
63ab369
Add proper quoting.
PapyChacal Aug 1, 2023
e15ce97
Add dimensionality-1 tiling dimensions logic.
PapyChacal Aug 2, 2023
bacf1af
mpi: Init effort for serial modelling on wave operator
georgebisbas Aug 2, 2023
7f3b37e
mpi: wip
georgebisbas Aug 2, 2023
e54cd7b
mpi: wip
georgebisbas Aug 2, 2023
625e976
wave3d: cleanup
georgebisbas Aug 3, 2023
b73489a
wave2d: cleanup
georgebisbas Aug 3, 2023
e4db7f9
mpi-mfe: Add
georgebisbas Aug 3, 2023
ea7fe19
hacky fix for row major dmp.grid
AntonLydike Aug 3, 2023
d45c113
bench: Conditional execution heat2d
georgebisbas Aug 4, 2023
ae3d586
bench: Conditional execution heat3d
georgebisbas Aug 4, 2023
9a973cc
bench: Generalize benchmarking scripts
georgebisbas Aug 4, 2023
4b34fc5
bench: Generalize wave3d
georgebisbas Aug 4, 2023
7d7e639
wave: TryAdd example with no Operator
georgebisbas Aug 4, 2023
26abd2e
add datatest
georgebisbas Aug 4, 2023
fd312d5
setup: Add necessary data
georgebisbas Aug 5, 2023
8db89d7
bench: Load dt to XDSL
georgebisbas Aug 5, 2023
bcd7a8d
setup: Save extent
georgebisbas Aug 5, 2023
8c8858a
bench: Add so to saved data
georgebisbas Aug 5, 2023
72fb27d
bench: Add wave3d setup
georgebisbas Aug 5, 2023
722e998
bench: compress saved data
georgebisbas Aug 5, 2023
aa73e33
bench: compress properly u.data[:]
georgebisbas Aug 5, 2023
163db36
Merge pull request #23 from xdslproject/emilien/stencil-tiling
georgebisbas Aug 6, 2023
ad55988
bench: More cleanup and tiling merge
georgebisbas Aug 6, 2023
6cfe569
bench: Hide pyvista req
georgebisbas Aug 6, 2023
80c0d31
Merge pull request #24 from xdslproject/bench_edits-2
georgebisbas Aug 9, 2023
dde653e
Insert necessary boilerplate. stencil lowerings doesn't handle it.
PapyChacal Aug 9, 2023
0d73b6e
Add more sensible and resilient tile sizes.
PapyChacal Aug 4, 2023
9082d9f
Try with a arguments-minimizing pipeline.
PapyChacal Aug 4, 2023
84b5522
Improve args-minimization pipeline (by still folding all compile-time…
PapyChacal Aug 6, 2023
c694e02
Remove superfluous GPU passes.
PapyChacal Aug 7, 2023
cca08a4
Add direct .so backdoor capability, and XDSL_SKIP_CLEAN env variable …
PapyChacal Aug 7, 2023
f057a23
Use DeVito's par-tile.
PapyChacal Aug 8, 2023
baa38fb
Use the boilerplate flag to not always copy to GPU.
PapyChacal Aug 9, 2023
fa0276f
Merge pull request #26 from xdslproject/emilien/gpu-again
georgebisbas Aug 9, 2023
devito/ir/ietxdsl/cluster_to_ssa.py (59 additions & 21 deletions)
@@ -1,7 +1,9 @@
# ------------- devito import -------------#

from sympy import Add, Expr, Float, Indexed, Integer, Mod, Mul, Pow, Symbol
from xdsl.dialects import arith, builtin, func, memref, scf, stencil
from devito.arch.archinfo import NvidiaDevice
from devito.parameters import configuration
from xdsl.dialects import arith, builtin, func, memref, scf, stencil, gpu
from xdsl.dialects.experimental import dmp, math
from xdsl.ir import Attribute, Block, Operation, OpResult, Region, SSAValue
from typing import Any
@@ -113,7 +115,7 @@ def _convert_eq(self, eq: LoweredEq):
), f"can only write to offset [0,0,0], given {offsets[1:]}"

self.block.add_op(stencil.ReturnOp.get([rhs_result]))
outermost_block.add_op(func.Return.get())
outermost_block.add_op(func.Return())

return func.FuncOp.from_region(
"apply_kernel", [], [], Region([outermost_block])
@@ -272,7 +274,7 @@ def _ensure_same_type(self, *vals: SSAValue):
new_vals.append(val)
continue
# insert an integer to float cast op
conv = arith.SIToFPOp.get(val, builtin.f32)
conv = arith.SIToFPOp(val, builtin.f32)
self.block.add_op(conv)
new_vals.append(conv.result)
return new_vals
@@ -323,6 +325,38 @@ def is_float(val: SSAValue):

from xdsl.dialects import llvm

@dataclass
class WrapFunctionWithTransfers(RewritePattern):
func_name: str
done: bool = field(default=False)

@op_type_rewrite_pattern
def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter):
if op.sym_name.data != self.func_name or self.done:
return
self.done = True

op.sym_name = builtin.StringAttr("gpu_kernel")
print("Doing GPU STUFF")
# GPU STUFF
wrapper = func.FuncOp(self.func_name, op.function_type, Region(Block([func.Return()], arg_types=op.function_type.inputs)))
body = wrapper.body.block
wrapper.body.block.insert_op_before(func.Call("gpu_kernel", body.args, []), body.last_op)
for arg in wrapper.args:
shapetype = arg.type
if isinstance(shapetype, stencil.FieldType):
memref_type = memref.MemRefType.from_element_type_and_shape(shapetype.get_element_type(), shapetype.get_shape())
alloc = gpu.AllocOp(memref.MemRefType.from_element_type_and_shape(shapetype.get_element_type(), shapetype.get_shape()))
outcast = builtin.UnrealizedConversionCastOp.get(alloc, shapetype)
arg.replace_by(outcast.results[0])
incast = builtin.UnrealizedConversionCastOp.get(arg, memref_type)
copy = gpu.MemcpyOp(source=incast, destination=alloc)
body.insert_ops_before([alloc, outcast, incast, copy], body.ops.first)

copy_out = gpu.MemcpyOp(source=alloc, destination=incast)
dealloc = gpu.DeallocOp(alloc)
body.insert_ops_before([copy_out, dealloc], body.ops.last)
rewriter.insert_op_after_matched_op(wrapper)
@dataclass
class MakeFunctionTimed(RewritePattern):
"""
@@ -340,16 +374,16 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter):
self.seen_ops.add(op)

rewriter.insert_op_at_start([
t0 := func.Call.get('timer_start', [], [builtin.f64])
t0 := func.Call('timer_start', [], [builtin.f64])
], op.body.block)

ret = op.get_return_op()
assert ret is not None

rewriter.insert_op_before([
timers := iet_ssa.LoadSymbolic.get('timers', llvm.LLVMPointerType.typed(builtin.f64)),
t1 := func.Call.get('timer_end', [t0], [builtin.f64]),
llvm.StoreOp.get(t1, timers),
t1 := func.Call('timer_end', [t0], [builtin.f64]),
llvm.StoreOp(t1, timers),
], ret)

rewriter.insert_op_after_matched_op([
@@ -405,8 +439,8 @@ def match_and_rewrite(self, op: iet_ssa.Stencil, rewriter: PatternRewriter, /):

for field in op.input_indices:
rewriter.insert_op_before_matched_op(load_op := stencil.LoadOp.get(field))
input_temps.append(load_op.res)
load_op.res.name_hint = field.name_hint + "_temp"
input_temps.insert(0, load_op.res)

rewriter.replace_matched_op(
[
@@ -479,8 +513,10 @@ def match_and_rewrite(self, op: iet_ssa.LoadSymbolic, rewriter: PatternRewriter,
if symb_name not in args:
body = parent.body.blocks[0]
args[symb_name] = body.insert_arg(op.result.type, len(body.args))


op.result.replace_by(args[symb_name])

rewriter.erase_matched_op()
parent.update_function_type()
# attach information on parameter names to func
@@ -492,26 +528,28 @@ def match_and_rewrite(self, op: iet_ssa.LoadSymbolic, rewriter: PatternRewriter,
)


def convert_devito_stencil_to_xdsl_stencil(module):
grpa = GreedyRewritePatternApplier(
[
_DevitoStencilToStencilStencil(),
LowerIetForToScfFor(),
MakeFunctionTimed('apply_kernel'),
def convert_devito_stencil_to_xdsl_stencil(module, timed:bool=True):
patterns:list[RewritePattern] = [
_DevitoStencilToStencilStencil(),
LowerIetForToScfFor(),
]
)
if timed:
patterns.append(MakeFunctionTimed('apply_kernel'))
grpa = GreedyRewritePatternApplier(patterns)
perf("DevitoStencil to stencil.stencil")
perf("LowerIetForToScfFor")

PatternRewriteWalker(grpa, walk_regions_first=True).rewrite_module(module)



def finalize_module_with_globals(module: builtin.ModuleOp, known_symbols: dict[str, Any]):
grpa = GreedyRewritePatternApplier(
[
_InsertSymbolicConstants(known_symbols),
_LowerLoadSymbolidToFuncArgs(),
]
)
def finalize_module_with_globals(module: builtin.ModuleOp, known_symbols: dict[str, Any], gpu_boilerplate):
patterns = [
_InsertSymbolicConstants(known_symbols),
_LowerLoadSymbolidToFuncArgs(),
]
grpa = GreedyRewritePatternApplier(patterns)
PatternRewriteWalker(grpa).rewrite_module(module)
if gpu_boilerplate:
walker = PatternRewriteWalker(GreedyRewritePatternApplier([WrapFunctionWithTransfers('apply_kernel')]))
walker.rewrite_module(module)
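The two functions above are this module's entry points: `convert_devito_stencil_to_xdsl_stencil` rewrites Devito stencils into `stencil` ops (optionally wrapping `apply_kernel` with timers), and `finalize_module_with_globals` folds in known symbol values and, when `gpu_boilerplate` is set, applies `WrapFunctionWithTransfers`. A minimal sketch of how they compose; the empty module and the `dt` value are placeholders, not taken from this PR:

```python
from xdsl.dialects import builtin
from devito.ir.ietxdsl.cluster_to_ssa import (
    convert_devito_stencil_to_xdsl_stencil,
    finalize_module_with_globals,
)

# Placeholder; in Devito this is the module produced by the IET lowering
module = builtin.ModuleOp([])

# Rewrite Devito stencils to stencil.stencil, skipping the timer wrapper
convert_devito_stencil_to_xdsl_stencil(module, timed=False)

# Fold in known symbol values; gpu_boilerplate=True would additionally wrap
# apply_kernel with host<->device transfers via WrapFunctionWithTransfers
finalize_module_with_globals(module, {'dt': 0.001}, gpu_boilerplate=False)
```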
devito/ir/ietxdsl/iet_ssa.py (1 addition & 1 deletion)
@@ -466,7 +466,7 @@ def get(
stencil.TempType(len(shape), typ)
] * (time_buffers - 1))

for block_arg, idx_arg in zip(block.args, time_indices):
for block_arg, idx_arg in zip(block.args, reversed(inputs)):
name = SSAValue.get(idx_arg).name_hint
if name is None:
continue
devito/ir/ietxdsl/ietxdsl_functions.py (6 additions & 7 deletions)
@@ -20,12 +20,11 @@
# XDSL specific imports
from xdsl.irdl import AnyOf, Operation, SSAValue
from xdsl.dialects.builtin import (ContainerOf, Float16Type, Float32Type,
Float64Type, Builtin, i32, f32)
Float64Type, i32, f32)

from xdsl.dialects.arith import Muli, Addi
from devito.ir.ietxdsl import iet_ssa

from xdsl.dialects import memref, arith, builtin, llvm
from xdsl.dialects import memref, arith, builtin
from xdsl.dialects.experimental import math

import devito.types
@@ -74,7 +73,7 @@ def print_calls(cgen, calldefs):
print("Call not translated in calldefs")
return

call = Call.get(call_name, C_names, C_typenames, C_typeqs, prefix, retval)
call = Call(call_name, C_names, C_typenames, C_typeqs, prefix, retval)

cgen.printCall(call, True)

@@ -180,10 +179,10 @@ def add_to_block(expr, arg_by_expr: dict[Any, Operation], result):
# reconcile differences

if isinstance(rhs.typ, builtin.IntegerType):
rhs = arith.SIToFPOp.get(rhs, lhs.typ)
rhs = arith.SIToFPOp(rhs, lhs.typ)
result.append(rhs)
else:
lhs = arith.SIToFPOp.get(lhs, rhs.typ)
lhs = arith.SIToFPOp(lhs, rhs.typ)
result.append(lhs)


@@ -426,7 +425,7 @@ def myVisit(node, block: Block, ssa_vals={}):
print(f"Call {node.name} instance translated as comment")
return

call = Call.get(call_name, C_names, C_typenames, C_typeqs, prefix, retval)
call = Call(call_name, C_names, C_typenames, C_typeqs, prefix, retval)
block.add_ops([call])

print(f"Call {node.name} translated")
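Many of the one-line edits in this file (and in cluster_to_ssa.py above) are the same mechanical migration: xdsl's legacy `.get(...)` factory classmethods replaced with plain constructors. A minimal sketch of the pattern, assuming an xdsl version that supports the constructor-style API:

```python
from xdsl.dialects import arith, builtin
from xdsl.ir import Block

# A block with a single i32 argument to feed the cast
block = Block(arg_types=[builtin.i32])
int_val = block.args[0]

# Before this PR: conv = arith.SIToFPOp.get(int_val, builtin.f32)
# After this PR, the constructor is called directly:
conv = arith.SIToFPOp(int_val, builtin.f32)
block.add_op(conv)
```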
devito/ir/ietxdsl/lowering.py (4 additions & 1 deletion)
@@ -116,6 +116,9 @@ def match_and_rewrite(self, op: iet_ssa.For, rewriter: PatternRewriter, /):
]
rewriter.insert_op_before_matched_op(subindice_vals)

subindice_vals = list(reversed(subindice_vals))
subindice_vals.append(subindice_vals.pop(0))

rewriter.replace_matched_op([
cst1 := arith.Constant.from_int_and_width(1, builtin.IndexType()),
new_ub := arith.Addi(op.ub, cst1),
@@ -368,7 +371,7 @@ def match_and_rewrite(self, op: memref.Store, rewriter: PatternRewriter,
ssa_indices=[idx],
result_type=llvm.LLVMPointerType.typed(op.memref.memref.element_type)
),
store := llvm.StoreOp.get(op.value, gep),
store := llvm.StoreOp(op.value, gep),
],
[],
)
devito/ir/ietxdsl/xdsl_passes.py (1 addition & 1 deletion)
@@ -143,7 +143,7 @@ def _op_to_func(op: Operator):
ietxdsl_functions.myVisit(i, block=block, ssa_vals=ssa_val_dict)

# add a trailing return
block.add_op(func.Return.get())
block.add_op(func.Return())

func_op = func.FuncOp.from_region(str(op.name), arg_types, [], Region([block]))

devito/mpi/distributed.py (113 additions & 3 deletions)
@@ -1,7 +1,9 @@
from abc import ABC, abstractmethod
from ctypes import c_int, c_void_p, sizeof
from itertools import groupby, product
from math import ceil
from abc import ABC, abstractmethod
from math import ceil, pow
from sympy import factorint

import atexit

from cached_property import cached_property
@@ -194,6 +196,9 @@ def __init__(self, shape, dimensions, input_comm=None, topology=None):
# mpi4py takes care of that when the object gets out of scope
self._input_comm = (input_comm or MPI.COMM_WORLD).Clone()

if len(shape) == 3:
topology = ('*', '*', 1)

if topology is None:
# `MPI.Compute_dims` sets the dimension sizes to be as close to each other
# as possible, using an appropriate divisibility algorithm. Thus, in 3D:
@@ -204,6 +209,9 @@ def __init__(self, shape, dimensions, input_comm=None, topology=None):
# guarantee that 9 ranks are arranged into a 3x3 grid when shape=(9, 9))
self._topology = compute_dims(self._input_comm.size, len(shape))
else:
# A custom topology may contain integers or the wildcard '*'
topology = CustomTopology(topology, self._input_comm)

self._topology = topology

if self._input_comm is not input_comm:
@@ -253,9 +261,18 @@ def nprocs(self):
def topology(self):
return self._topology

@property
def topology_logical(self):
if isinstance(self.topology, CustomTopology):
return self.topology.logical
else:
return None

@cached_property
def is_boundary_rank(self):
""" MPI rank interfaces with the boundary of the domain. """
"""
MPI rank interfaces with the boundary of the domain.
"""
return any([True if i == 0 or i == j-1 else False for i, j in
zip(self.mycoords, self.topology)])

@@ -550,6 +567,99 @@ def _arg_values(self, *args, **kwargs):
return self._arg_defaults()


class CustomTopology(tuple):

"""
The CustomTopology class provides a mechanism to describe parametric domain
decompositions. It allows users to specify how the dimensions of a domain are
decomposed into chunks based on certain parameters.

Examples
--------
For example, let's consider a domain with three distributed dimensions: x, y, and z,
and an MPI communicator with N processes. Here are a few examples of CustomTopology:

With N known, say N=4:
* `(1, 1, 4)`: the z Dimension is decomposed into 4 chunks
* `(2, 1, 2)`: the x Dimension is decomposed into 2 chunks and the z Dimension
is decomposed into 2 chunks

With N unknown:
* `(1, '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose the y
Dimension into N chunks
* `('*', '*', 1)`: the wildcards `'*'` indicate that the runtime should decompose both
the x and y Dimensions into `nstars` factors of N, prioritizing
the outermost dimension

If the number of ranks `N` cannot be decomposed evenly across the requested wildcard
positions, the decomposition is made as even as possible, prioritising the outermost dimension:

For N=3
* `('*', '*', 1)` gives: (3, 1, 1)
* `('*', 1, '*')` gives: (3, 1, 1)
* `(1, '*', '*')` gives: (1, 3, 1)

For N=6
* `('*', '*', 1)` gives: (3, 2, 1)
* `('*', 1, '*')` gives: (3, 1, 2)
* `(1, '*', '*')` gives: (1, 3, 2)

For N=8
* `('*', '*', '*')` gives: (2, 2, 2)
* `('*', '*', 1)` gives: (4, 2, 1)
* `('*', 1, '*')` gives: (4, 1, 2)
* `(1, '*', '*')` gives: (1, 4, 2)

Notes
-----
Users should not directly use the CustomTopology class. It is instantiated
by the Devito runtime based on user input.
"""

def __new__(cls, items, input_comm):
# Keep track of nstars and already defined decompositions
nstars = items.count('*')

# If no stars exist we are ready
if nstars == 0:
processed = items
else:
# Init decomposition list and track star positions
processed = [1] * len(items)
star_pos = []
for i, item in enumerate(items):
if isinstance(item, int):
processed[i] = item
else:
star_pos.append(i)

# Compute the remaining procs to be allocated
alloc_procs = np.prod([i for i in items if i != '*'])
rem_procs = int(input_comm.size // alloc_procs)

# List of all factors of rem_procs in decreasing order
factors = factorint(rem_procs)
vals = [k for (k, v) in factors.items() for _ in range(v)][::-1]

# Split in number of stars
split = np.array_split(vals, nstars)

# Reduce
star_vals = [int(np.prod(s)) for s in split]

# Apply computed star values to the processed
for index, value in zip(star_pos, star_vals):
processed[index] = value

# Final check that topology matches the communicator size
assert np.prod(processed) == input_comm.size

obj = super().__new__(cls, processed)
obj.logical = items

return obj


def compute_dims(nprocs, ndim):
# We don't do anything clever here. In fact, we do something very basic --
# we just try to distribute `nprocs` evenly over the number of dimensions,
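The star-allocation logic in `CustomTopology.__new__` can be exercised without MPI. Below is a self-contained sketch of the same algorithm; the communicator size is passed in directly, whereas in Devito it comes from `input_comm.size`:

```python
import numpy as np
from sympy import factorint

def resolve_topology(items, comm_size):
    """Mirror of CustomTopology.__new__'s decomposition logic."""
    nstars = items.count('*')
    if nstars == 0:
        return tuple(items)

    # Track star positions; explicit integers pass through unchanged
    processed = [1] * len(items)
    star_pos = []
    for i, item in enumerate(items):
        if isinstance(item, int):
            processed[i] = item
        else:
            star_pos.append(i)

    # Processes left over after honoring the explicit integer entries
    alloc_procs = np.prod([i for i in items if i != '*'])
    rem_procs = int(comm_size // alloc_procs)

    # Prime factors of the remainder, largest first
    factors = factorint(rem_procs)
    vals = [k for (k, v) in factors.items() for _ in range(v)][::-1]

    # Spread the factors over the starred slots, outermost first
    split = np.array_split(vals, nstars)
    star_vals = [int(np.prod(s)) for s in split]
    for index, value in zip(star_pos, star_vals):
        processed[index] = value

    assert np.prod(processed) == comm_size
    return tuple(processed)

print(resolve_topology(('*', '*', 1), 6))  # (3, 2, 1)
print(resolve_topology(('*', 1, '*'), 8))  # (4, 1, 2)
```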