xdslproject · AntonLydike · Oct 3, 2023 · Jul 14, 2023 · Jul 14, 2023 · Jul 14, 2023
diff --git a/devito/ir/ietxdsl/cluster_to_ssa.py b/devito/ir/ietxdsl/cluster_to_ssa.py
@@ -113,7 +113,7 @@ def _convert_eq(self, eq: LoweredEq):
         ), f"can only write to offset [0,0,0], given {offsets[1:]}"
 
         self.block.add_op(stencil.ReturnOp.get([rhs_result]))
-        outermost_block.add_op(func.Return.get())
+        outermost_block.add_op(func.Return())
 
         return func.FuncOp.from_region(
             "apply_kernel", [], [], Region([outermost_block])

diff --git a/devito/ir/ietxdsl/xdsl_passes.py b/devito/ir/ietxdsl/xdsl_passes.py
@@ -143,7 +143,7 @@ def _op_to_func(op: Operator):
             ietxdsl_functions.myVisit(i, block=block, ssa_vals=ssa_val_dict)
 
     # add a trailing return
-    block.add_op(func.Return.get())
+    block.add_op(func.Return())
 
     func_op = func.FuncOp.from_region(str(op.name), arg_types, [], Region([block]))
 

diff --git a/devito/mpi/distributed.py b/devito/mpi/distributed.py
@@ -1,7 +1,9 @@
+from abc import ABC, abstractmethod
 from ctypes import c_int, c_void_p, sizeof
 from itertools import groupby, product
-from math import ceil
-from abc import ABC, abstractmethod
+from math import ceil, pow
+from sympy import factorint
+
 import atexit
 
 from cached_property import cached_property
@@ -194,6 +196,7 @@ def __init__(self, shape, dimensions, input_comm=None, topology=None):
             # mpi4py takes care of that when the object gets out of scope
             self._input_comm = (input_comm or MPI.COMM_WORLD).Clone()
 
+            topology = ('*', '*', 1)
             if topology is None:
                 # `MPI.Compute_dims` sets the dimension sizes to be as close to each other
                 # as possible, using an appropriate divisibility algorithm. Thus, in 3D:
@@ -204,6 +207,9 @@ def __init__(self, shape, dimensions, input_comm=None, topology=None):
                 # guarantee that 9 ranks are arranged into a 3x3 grid when shape=(9, 9))
                 self._topology = compute_dims(self._input_comm.size, len(shape))
             else:
+                # A custom topology may contain integers or the wildcard '*'
+                topology = CustomTopology(topology, self._input_comm)
+
                 self._topology = topology
 
             if self._input_comm is not input_comm:
@@ -253,9 +259,18 @@ def nprocs(self):
     def topology(self):
         return self._topology
 
+    @property
+    def topology_logical(self):
+        if isinstance(self.topology, CustomTopology):
+            return self.topology.logical
+        else:
+            return None
+
     @cached_property
     def is_boundary_rank(self):
-        """ MPI rank interfaces with the boundary of the domain. """
+        """
+        MPI rank interfaces with the boundary of the domain.
+        """
         return any([True if i == 0 or i == j-1 else False for i, j in
                    zip(self.mycoords, self.topology)])
 
@@ -550,6 +565,99 @@ def _arg_values(self, *args, **kwargs):
             return self._arg_defaults()
 
 
+class CustomTopology(tuple):
+
+    """
+    The CustomTopology class provides a mechanism to describe parametric domain
+    decompositions. It allows users to specify how the dimensions of a domain are
+    decomposed into chunks based on certain parameters.
+
+    Examples
+    --------
+    For example, let's consider a domain with three distributed dimensions: x, y, and z,
+    and an MPI communicator with N processes. Here are a few examples of CustomTopology:
+
+    With N known, say N=4:
+    * `(1, 1, 4)`: the z Dimension is decomposed into 4 chunks
+    * `(2, 1, 2)`: the x Dimension is decomposed into 2 chunks and the z Dimension
+                   is decomposed into 2 chunks
+
+    With N unknown:
+    * `(1, '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose the y
+                     Dimension into N chunks
+    * `('*', '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose both
+                       the x and y Dimensions in `nstars` factors of N, prioritizing
+                       the outermost dimension
+
+    Assuming that the number of ranks `N` cannot evenly be decomposed to the requested
+    stars=6 we decompose as evenly as possible by prioritising the outermost dimension
+
+    For N=3
+    * `('*', '*', 1)` gives: (3, 1, 1)
+    * `('*', 1, '*')` gives: (3, 1, 1)
+    * `(1, '*', '*')` gives: (1, 3, 1)
+
+    For N=6
+    * `('*', '*', 1)` gives: (3, 2, 1)
+    * `('*', 1, '*')` gives: (3, 1, 2)
+    * `(1, '*', '*')` gives: (1, 3, 2)
+
+    For N=8
+    * `('*', '*', '*')` gives: (2, 2, 2)
+    * `('*', '*', 1)` gives: (4, 2, 1)
+    * `('*', 1, '*')` gives: (4, 1, 2)
+    * `(1, '*', '*')` gives: (1, 4, 2)
+
+    Notes
+    -----
+    Users should not directly use the CustomTopology class. It is instantiated
+    by the Devito runtime based on user input.
+    """
+
+    def __new__(cls, items, input_comm):
+        # Keep track of nstars and already defined decompositions
+        nstars = items.count('*')
+
+        # If no stars exist we are ready
+        if nstars == 0:
+            processed = items
+        else:
+            # Init decomposition list and track star positions
+            processed = [1] * len(items)
+            star_pos = []
+            for i, item in enumerate(items):
+                if isinstance(item, int):
+                    processed[i] = item
+                else:
+                    star_pos.append(i)
+
+            # Compute the remaining procs to be allocated
+            alloc_procs = np.prod([i for i in items if i != '*'])
+            rem_procs = int(input_comm.size // alloc_procs)
+
+            # List of all factors of rem_procs in decreasing order
+            factors = factorint(rem_procs)
+            vals = [k for (k, v) in factors.items() for _ in range(v)][::-1]
+
+            # Split in number of stars
+            split = np.array_split(vals, nstars)
+
+            # Reduce
+            star_vals = [int(np.prod(s)) for s in split]
+
+            # Apply computed star values to the processed
+            for index, value in zip(star_pos, star_vals):
+                processed[index] = value
+
+        # Final check that topology matches the communicator size
+        assert np.prod(processed) == input_comm.size
+
+        obj = super().__new__(cls, processed)
+        obj.logical = items
+
+        return obj
+
+
 def compute_dims(nprocs, ndim):
     # We don't do anything clever here. In fact, we do something very basic --
     # we just try to distribute `nprocs` evenly over the number of dimensions,

diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py
@@ -138,6 +138,11 @@ def _jit_compile(self):
                 with open(backdoor, 'r') as f:
                     module_str = f.read()
 
+            source_name = os.path.splitext(self._tf.name)[0] + ".mlir"
+            source_file = open(source_name, "w")
+            source_file.write(module_str)
+            source_file.close()
+
             # compile IR using xdsl-opt | mlir-opt | mlir-translate | clang
             try:
                 cflags = CFLAGS
@@ -151,17 +156,21 @@ def _jit_compile(self):
                 if is_gpu:
                     cflags += " -lmlir_cuda_runtime "
 
-                cmd = f'xdsl-opt -p {xdsl_pipeline} |' \
+                # TODO More detailed error handling manually,
+                # instead of relying on a bash-only feature.
+                cmd = 'set -eo pipefail; '\
+                    f'xdsl-opt {source_name} -p {xdsl_pipeline} |' \
                     f'mlir-opt -p {mlir_pipeline} | ' \
                     f'mlir-translate --mlir-to-llvmir | ' \
                     f'{cc} {cflags} -shared -o {self._tf.name} {self._interop_tf.name} -xir -'
 
+                print(cmd)
                 res = subprocess.run(
                     cmd,
                     shell=True,
-                    input=module_str,
                     text=True,
                     capture_output=True,
+                    executable="/bin/bash"
                 )
 
                 if res.returncode != 0:
@@ -172,13 +181,13 @@ def _jit_compile(self):
             except Exception as ex:
                 print("error")
                 raise ex
-            #print(res.stderr)
+            # print(res.stderr)
 
         elapsed = self._profiler.py_timers['jit-compile']
-        
+
         perf("XDSLOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" %
-                    (self.name, self._tf.name, elapsed))
-        
+             (self.name, source_name, elapsed))
+
     @property
     def _soname(self):
         return self._tf.name

diff --git a/fast/diffusion_2D_wBCs.py b/fast/diffusion_2D_wBCs.py
@@ -42,12 +42,9 @@
 u = TimeFunction(name='u', grid=grid, space_order=so)
 
 # Reset our data field and ICs
-# u.data[:, :, :] = 0.1
 init_hat(field=u.data[0], dx=dx, dy=dy, value=1.)
 
 
-# u.data[0, :, int(ny/2)] = 2
-
 a = Constant(name='a')
 # Create an equation with second-order derivatives
 eq = Eq(u.dt, a * u.laplace, subdomain=grid.interior)
@@ -58,23 +55,9 @@
 x, y = grid.dimensions
 t = grid.stepping_dim
 
-# Add boundary conditions
-# bc = [Eq(u[t+1, x, y, 0], 2.)]  # bottom
-# bc += [Eq(u[t+1, x, y, nz-1], 2.)]  # top
-# bc += [Eq(u[t+1, 0, y, z], 2.)]  # left
-# bc += [Eq(u[t+1, nx-1, y, z], 2.)]  # right
-
-# bc += [Eq(u[t+1, x, 0, z], 2.)]  # front
-# bc += [Eq(u[t+1, x, ny-1, z], 2.)]  # back
-
 print(eq_stencil)
 
 # Create an operator that updates the forward stencil point
-# plus adding boundary conditions
-# op = Operator([eq_stencil] + bc, subdomain=grid.interior)
-
-# No BCs
-# op = XDSLOperator([eq_stencil])
 op = Operator([eq_stencil])
 # print(op.ccode)
 # dt = 0.00002
@@ -83,8 +66,15 @@
 print(dt)
 op.apply(time=nt, dt=dt, a=nu)
 
-print("Field norm is:", norm(u))
+print("Devito Field norm is:", norm(u))
 
 plot_image(u.data[0], cmap="seismic")
 
-# import pdb;pdb.set_trace()
+
+# Reset our data field and ICs
+init_hat(field=u.data[0], dx=dx, dy=dy, value=1.)
+xdslop = XDSLOperator([eq_stencil])
+xdslop.apply(time=nt, dt=dt, a=nu)
+plot_image(u.data[0], cmap="seismic")
+
+print("xDSL Field norm is:", norm(u))
diff --git a/fast/diffusion_3D_wBCs.py b/fast/diffusion_3D_wBCs.py
@@ -31,7 +31,7 @@ def plot_3dfunc(u):
 
     cmap = plt.cm.get_cmap("viridis")
     values = u.data[0, :, :, :]
-    vistagrid = pv.UniformGrid()
+    vistagrid = pv.ImageData()
     vistagrid.dimensions = np.array(values.shape) + 1
     vistagrid.spacing = (1, 1, 1)
     vistagrid.origin = (0, 0, 0)  # The bottom left corner of the data set
@@ -73,31 +73,27 @@ def plot_3dfunc(u):
 x, y, z = grid.dimensions
 t = grid.stepping_dim
 
-# Add boundary conditions
-# bc = [Eq(u[t+1, x, y, 0], 2.)]  # bottom
-# bc += [Eq(u[t+1, x, y, nz-1], 2.)]  # top
-# bc += [Eq(u[t+1, 0, y, z], 2.)]  # left
-# bc += [Eq(u[t+1, nx-1, y, z], 2.)]  # right
-
-# bc += [Eq(u[t+1, x, 0, z], 2.)]  # front
-# bc += [Eq(u[t+1, x, ny-1, z], 2.)]  # back
-
 print(eq_stencil)
 
-# Create an operator that updates the forward stencil point
-# plus adding boundary conditions
-# op = Operator([eq_stencil] + bc, subdomain=grid.interior)
-
 # No BCs
-# op = XDSLOperator([eq_stencil])
 op = Operator([eq_stencil])
-# print(op.ccode)
 
 # Apply the operator for a number of timesteps
 op.apply(time=nt, dt=dt, a=nu)
 
 if args.plot:
     plot_3dfunc(u)
 
-print("Field norm is:", norm(u))
-# import pdb;pdb.set_trace()
+print("Devito Field norm is:", norm(u))
+
+# Reset data for XDSL
+u.data[:, :, :, :] = 0
+u.data[:, :, :, int(nz/2)] = 1
+
+xdslop = XDSLOperator([eq_stencil])
+xdslop.apply(time=nt, dt=dt, a=nu)
+
+if args.plot:
+    plot_3dfunc(u)
+
+print("xDSL Field norm is:", norm(u))