From 06dc480b521ecfdce89d30a290bb1e92ea220c10 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Mon, 19 Aug 2024 12:43:27 +0100
Subject: [PATCH 01/25] Reload stored field via stencil.LoadOp and add stencil-bufferize to the xDSL pipelines.

---
 devito/ir/xdsl_iet/cluster_to_ssa.py | 30 ++++++++++++----------------
 devito/xdsl_core/xdsl_cpu.py         | 17 +++++++++++++---
 tests/test_xdsl_base.py              |  3 ++-
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py
index 37713b5322..48d2489d12 100644
--- a/devito/ir/xdsl_iet/cluster_to_ssa.py
+++ b/devito/ir/xdsl_iet/cluster_to_ssa.py
@@ -287,7 +287,7 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr,
             SSAargs = (self._visit_math_nodes(dim, arg, output_indexed)
                        for arg in node.args)
             return reduce(lambda x, y : arith.AndI(x, y).result, SSAargs)
-        
+
         # Trigonometric functions
         elif isinstance(node, sin):
             assert len(node.args) == 1, "Expected single argument for sin."
@@ -298,13 +298,13 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr,
             assert len(node.args) == 1, "Expected single argument for cos."           
             return math.CosOp(self._visit_math_nodes(dim, node.args[0],
                               output_indexed)).result
-        
+
         elif isinstance(node, tan):
             assert len(node.args) == 1, "Expected single argument for TanOp."
-            
+
             return math.TanOp(self._visit_math_nodes(dim, node.args[0],
                               output_indexed)).result
-                   
+
         elif isinstance(node, Relational):
             if isinstance(node, GreaterThan):
                 mnemonic = "sge"
@@ -391,12 +391,10 @@ def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None:
             apply.res[0],
             self.function_values[self.out_time_buffer],
             stencil.StencilBoundsAttr(zip(lb, ub)),
-            stencil.TempType(len(shape),
-                             element_type=dtype_to_xdsltype(write_function.dtype))
         )
-
-        store.temp_with_halo.name_hint = f"{write_function.name}_t{self.out_time_buffer[1]}_temp"  # noqa
-        self.temps[self.out_time_buffer] = store.temp_with_halo
+        load = stencil.LoadOp.get(self.function_values[self.out_time_buffer])
+        load.res.name_hint = f"{write_function.name}_t{self.out_time_buffer[1]}_temp"  # noqa
+        self.temps[self.out_time_buffer] = load.res
 
     def build_generic_step_expression(self, dim: SteppingDimension, eq: LoweredEq):
         # Sources
@@ -439,7 +437,6 @@ def build_condition(self, dim: SteppingDimension, eq: BooleanFunction):
             self.build_generic_step_expression(dim, eq)
             scf.Yield()
 
-
     def build_time_loop(
         self, eqs: list[Any], step_dim: SteppingDimension, **kwargs
     ):
@@ -450,7 +447,7 @@ def build_time_loop(
         ub = iet_ssa.LoadSymbolic.get(
             step_dim.symbolic_max._C_name, IndexType()
         )
-        
+
         one = arith.Constant.from_int_and_width(1, IndexType())
 
         # Devito iterates from time_m to time_M *inclusive*, MLIR only takes
@@ -497,7 +494,7 @@ def build_time_loop(
             for i, (f, t) in enumerate(self.time_buffers)
         }
         self.function_values |= self.block_args
-        
+
         # Name the block argument for debugging
         for (f, t), arg in self.block_args.items():
             arg.name_hint = f"{f.name}_t{t}"
@@ -513,8 +510,7 @@ def build_time_loop(
 
     def lower_devito_Eqs(self, eqs: list[Any], **kwargs):
         # Lower devito Equations to xDSL
-        
-        
+
         for eq in eqs:
             lowered = self.operator._lower_exprs(as_tuple(eq), **kwargs)
             if isinstance(eq, Eq):
@@ -546,7 +542,7 @@ def _lower_injection(self, eqs: list[LoweredEq]):
                 lb = arith.Constant.from_int_and_width(int(lower), IndexType())
             else:
                 raise NotImplementedError(f"Lower bound of type {type(lower)} not supported")
-            
+
             try:
                 name = interval.dim.symbolic_min.name
             except:
@@ -633,7 +629,7 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> ModuleOp:
         # Instantiate the module.
         self.function_values: dict[tuple[Function, int], SSAValue] = {}
         self.symbol_values: dict[str, SSAValue] = {}
-        
+
         module = ModuleOp(Region([block := Block([])]))
         with ImplicitBuilder(block):
             # Get all functions used in the equations
@@ -647,7 +643,7 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> ModuleOp:
                         functions.add(f.function)
 
                 elif isinstance(eq, Injection):
-                    
+
                     functions.add(eq.field.function)
                     for f in retrieve_functions(eq.expr):
                         if isinstance(f, PointSource):
diff --git a/devito/xdsl_core/xdsl_cpu.py b/devito/xdsl_core/xdsl_cpu.py
index cd86fc0d9c..e56d6a9a5d 100644
--- a/devito/xdsl_core/xdsl_cpu.py
+++ b/devito/xdsl_core/xdsl_cpu.py
@@ -460,6 +460,7 @@ def _jit_compile(self):
                 # Run the first pipeline, mostly xDSL-centric
                 xdsl_args = [source_name,
                              "--allow-unregistered-dialect",
+                             "--disable-verify",
                              "-p",
                              xdsl_pipeline[1:-1],]
                 # We use the Python API to run xDSL rather than a subprocess
@@ -597,7 +598,10 @@ def generate_MLIR_OPENMP_PIPELINE(kwargs):
 
 def generate_XDSL_CPU_PIPELINE(nb_tiled_dims):
     passes = [
-        "stencil-shape-inference",
+        "canonicalize",
+        "cse",
+        "shape-inference",
+        "stencil-bufferize",
         "convert-stencil-to-ll-mlir",
         f"scf-parallel-loop-tiling{{{generate_tiling_arg(nb_tiled_dims)}}}",
         "printf-to-llvm",
@@ -609,7 +613,10 @@ def generate_XDSL_CPU_PIPELINE(nb_tiled_dims):
 
 def generate_XDSL_CPU_noop_PIPELINE():
     passes = [
-        "stencil-shape-inference",
+        "canonicalize",
+        "cse",
+        "shape-inference",
+        "stencil-bufferize",
         "convert-stencil-to-ll-mlir",
         "printf-to-llvm"
     ]
@@ -619,11 +626,15 @@ def generate_XDSL_CPU_noop_PIPELINE():
 
 def generate_XDSL_MPI_PIPELINE(decomp, nb_tiled_dims):
     passes = [
+        "canonicalize",
+        "cse",
         f"distribute-stencil{decomp}",
+        "shape-inference",
         "canonicalize-dmp",
+        "stencil-bufferize",
+        "dmp-to-mpi{mpi_init=false}",
         "convert-stencil-to-ll-mlir",
         f"scf-parallel-loop-tiling{{{generate_tiling_arg(nb_tiled_dims)}}}",
-        "dmp-to-mpi{mpi_init=false}",
         "lower-mpi",
         "printf-to-llvm",
         "canonicalize"
diff --git a/tests/test_xdsl_base.py b/tests/test_xdsl_base.py
index 39aa97828b..72ddfcb16a 100644
--- a/tests/test_xdsl_base.py
+++ b/tests/test_xdsl_base.py
@@ -73,7 +73,8 @@ def test_xdsl_III():
     assert isinstance(scffor_ops[0], LoadOp)
     assert isinstance(scffor_ops[1], ApplyOp)
     assert isinstance(scffor_ops[2], StoreOp)
-    assert isinstance(scffor_ops[3], Yield)
+    assert isinstance(scffor_ops[3], LoadOp)
+    assert isinstance(scffor_ops[4], Yield)
 
     assert type(ops[7] == Call)
     assert type(ops[8] == StoreOp)

From 0ab86cab8d117ca72c9b7a65dd704d816fb72acb Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Mon, 19 Aug 2024 12:45:36 +0100
Subject: [PATCH 02/25] (CI hack) use new temp xDSL commits with latest tweaks.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 94f7821666..1979a22cc6 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@f8bb935880276cf077e0a80f1905105d0a98eb33
+        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index b1000d71d5..0193daf44b 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@f8bb935880276cf077e0a80f1905105d0a98eb33
+        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index e8c086a5b7..7ebcfe6fa2 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@f8bb935880276cf077e0a80f1905105d0a98eb33
+        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index aa36a701a5..0967be3216 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@f8bb935880276cf077e0a80f1905105d0a98eb33
+        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index 1b8df3226e..7b5245ff37 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@f8bb935880276cf077e0a80f1905105d0a98eb33
+        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
         
     - name: Test no-MPI, no-Openmp
       run: |

From dc2ee6583a1dc9677ece60ed6369d7f739224e03 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 13:43:26 +0100
Subject: [PATCH 03/25] Try with hopefully fixed extract_strided_metadata.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 1979a22cc6..3423c29f2c 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
+        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index 0193daf44b..35f42f8822 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
+        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index 7ebcfe6fa2..bb5d518367 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
+        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index 0967be3216..1df134168a 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
+        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index 7b5245ff37..b010fdcc37 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@540da57a7bcc5f05d3b98da0ea1f88420a3dbdf0
+        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
         
     - name: Test no-MPI, no-Openmp
       run: |

From 4e6452a2f65ffe267a9d859d837e96c546323b4f Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 14:08:19 +0100
Subject: [PATCH 04/25] Post module __init__ fix.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 tests/filecheck/.lit_test_times.txt      | 9 +++++++++
 6 files changed, 14 insertions(+), 5 deletions(-)
 create mode 100644 tests/filecheck/.lit_test_times.txt

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 3423c29f2c..83a3852517 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
+        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index 35f42f8822..5a8cacaf51 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
+        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index bb5d518367..3abe00aabc 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
+        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index 1df134168a..976c3fc72d 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
+        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index b010fdcc37..606de0956d 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@0a2420cdf0f337f8237ec142171d5c8f046daf19
+        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
         
     - name: Test no-MPI, no-Openmp
       run: |
diff --git a/tests/filecheck/.lit_test_times.txt b/tests/filecheck/.lit_test_times.txt
new file mode 100644
index 0000000000..4d1f2a8c89
--- /dev/null
+++ b/tests/filecheck/.lit_test_times.txt
@@ -0,0 +1,9 @@
+-8.160377e-02 shape_inference.mlir
+8.171344e-02 version.mlir
+-1.473970e-01 xdsl_mpi_pipeline.mlir
+-1.490667e-01 xdsl_mpi_pipeline_b.mlir
+-1.464252e-01 xdsl_mpi_pipeline_c.mlir
+-1.516540e-01 xdsl_mpi_pipeline_d.mlir
+-1.704619e-01 xdsl_mpi_pipeline_e.mlir
+-1.000817e-01 xdsl_pipeline.mlir
+-1.596556e-01 xdsl_pipeline_openmp.mlir

From a0cddd70b9a5efda293e0b8376677af65f083f00 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 16:31:33 +0100
Subject: [PATCH 05/25] Bump the pinned xDSL commit in the CI workflows again.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 83a3852517..8d88f522e6 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
+        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index 5a8cacaf51..9058572c27 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
+        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index 3abe00aabc..b208df760a 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
+        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index 976c3fc72d..d5ca62ecdf 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
+        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index 606de0956d..bde4de750f 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@306c5e00e3f715ae6710467bb1179e9fb3f394c8
+        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
         
     - name: Test no-MPI, no-Openmp
       run: |

From 53095385016b8255d6b2a953eed9a640df6fd51a Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 16:56:41 +0100
Subject: [PATCH 06/25] Temporarily disable verify on noop too.

---
 devito/xdsl_core/xdsl_cpu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/devito/xdsl_core/xdsl_cpu.py b/devito/xdsl_core/xdsl_cpu.py
index e56d6a9a5d..7fb22e4e21 100644
--- a/devito/xdsl_core/xdsl_cpu.py
+++ b/devito/xdsl_core/xdsl_cpu.py
@@ -184,6 +184,7 @@ def _jit_compile(self):
                 xdsl_args = [source_name,
                              "--allow-unregistered-dialect",
                              "-p",
+                             "--disable-verify",
                              xdsl_pipeline[1:-1],]
                 # We use the Python API to run xDSL rather than a subprocess
                 # This avoids reimport overhead

From 2c8dfaec1c06b090b36b8b6490e1ce78c0b6168d Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 17:09:13 +0100
Subject: [PATCH 07/25] Another bump on the verifier relaxation.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 devito/xdsl_core/xdsl_cpu.py             | 2 --
 6 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 8d88f522e6..a187172c3e 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
+        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index 9058572c27..7cd7d7cbb1 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
+        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index b208df760a..6dd145604e 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
+        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index d5ca62ecdf..39bc907225 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
+        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index bde4de750f..796882d73e 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@f6fdfb86d20f39f2d0e2e3f76900a1013bc5ef79
+        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
         
     - name: Test no-MPI, no-Openmp
       run: |
diff --git a/devito/xdsl_core/xdsl_cpu.py b/devito/xdsl_core/xdsl_cpu.py
index 7fb22e4e21..f9c4e66769 100644
--- a/devito/xdsl_core/xdsl_cpu.py
+++ b/devito/xdsl_core/xdsl_cpu.py
@@ -184,7 +184,6 @@ def _jit_compile(self):
                 xdsl_args = [source_name,
                              "--allow-unregistered-dialect",
                              "-p",
-                             "--disable-verify",
                              xdsl_pipeline[1:-1],]
                 # We use the Python API to run xDSL rather than a subprocess
                 # This avoids reimport overhead
@@ -461,7 +460,6 @@ def _jit_compile(self):
                 # Run the first pipeline, mostly xDSL-centric
                 xdsl_args = [source_name,
                              "--allow-unregistered-dialect",
-                             "--disable-verify",
                              "-p",
                              xdsl_pipeline[1:-1],]
                 # We use the Python API to run xDSL rather than a subprocess

From 11e9885849c15591ae0d96f65f8f4f33b1263ada Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 17:13:49 +0100
Subject: [PATCH 08/25] Syntax update.

---
 devito/xdsl_core/xdsl_gpu.py             |  2 +-
 tests/filecheck/.lit_test_times.txt      | 18 +++++++++---------
 tests/filecheck/shape_inference.mlir     |  6 +++---
 tests/filecheck/xdsl_mpi_pipeline.mlir   |  4 ++--
 tests/filecheck/xdsl_mpi_pipeline_b.mlir |  4 ++--
 tests/filecheck/xdsl_mpi_pipeline_c.mlir |  2 +-
 tests/filecheck/xdsl_mpi_pipeline_d.mlir |  2 +-
 tests/filecheck/xdsl_mpi_pipeline_e.mlir |  2 +-
 tests/filecheck/xdsl_pipeline.mlir       |  4 ++--
 9 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/devito/xdsl_core/xdsl_gpu.py b/devito/xdsl_core/xdsl_gpu.py
index 41bd08e084..9f281015b3 100644
--- a/devito/xdsl_core/xdsl_gpu.py
+++ b/devito/xdsl_core/xdsl_gpu.py
@@ -140,7 +140,7 @@ def _jit_compile(self):
 
 def generate_XDSL_GPU_PIPELINE():
     passes = [
-        "stencil-shape-inference",
+        "shape-inference",
         "convert-stencil-to-ll-mlir",
         "reconcile-unrealized-casts",
         "printf-to-llvm",
diff --git a/tests/filecheck/.lit_test_times.txt b/tests/filecheck/.lit_test_times.txt
index 4d1f2a8c89..84be361be3 100644
--- a/tests/filecheck/.lit_test_times.txt
+++ b/tests/filecheck/.lit_test_times.txt
@@ -1,9 +1,9 @@
--8.160377e-02 shape_inference.mlir
-8.171344e-02 version.mlir
--1.473970e-01 xdsl_mpi_pipeline.mlir
--1.490667e-01 xdsl_mpi_pipeline_b.mlir
--1.464252e-01 xdsl_mpi_pipeline_c.mlir
--1.516540e-01 xdsl_mpi_pipeline_d.mlir
--1.704619e-01 xdsl_mpi_pipeline_e.mlir
--1.000817e-01 xdsl_pipeline.mlir
--1.596556e-01 xdsl_pipeline_openmp.mlir
+1.627717e-01 shape_inference.mlir
+9.103966e-02 version.mlir
+-1.920681e-01 xdsl_mpi_pipeline.mlir
+-1.922677e-01 xdsl_mpi_pipeline_b.mlir
+-1.886339e-01 xdsl_mpi_pipeline_c.mlir
+-2.020643e-01 xdsl_mpi_pipeline_d.mlir
+-2.121959e-01 xdsl_mpi_pipeline_e.mlir
+-2.791779e-01 xdsl_pipeline.mlir
+-1.644087e-01 xdsl_pipeline_openmp.mlir
diff --git a/tests/filecheck/shape_inference.mlir b/tests/filecheck/shape_inference.mlir
index 77194b9944..764541d970 100644
--- a/tests/filecheck/shape_inference.mlir
+++ b/tests/filecheck/shape_inference.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p stencil-shape-inference %s | filecheck %s
+// RUN: xdsl-opt -p shape-inference %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%f2_vec0 : !stencil.field<[-2,5]x[-2,5]xf32>, %f2_vec1 : !stencil.field<[-2,5]x[-2,5]xf32>, %timers : !llvm.ptr) {
@@ -64,7 +64,7 @@ builtin.module {
         %47 = arith.mulf %46, %dt_1 : f32
         stencil.return %47 : f32
       }
-      %f2_t1_temp_1 = stencil.store %f2_t1_temp to %f2_t1 ([0, 0] : [3, 3]) : !stencil.temp<?x?xf32> to !stencil.field<[-2,5]x[-2,5]xf32> with_halo : !stencil.temp<?x?xf32>
+      stencil.store %f2_t1_temp to %f2_t1(<[0, 0], [3, 3]>)  : !stencil.temp<?x?xf32> to !stencil.field<[-2,5]x[-2,5]xf32>
       scf.yield %f2_t1, %f2_t0 : !stencil.field<[-2,5]x[-2,5]xf32>, !stencil.field<[-2,5]x[-2,5]xf32>
     }
     %5 = func.call @timer_end(%0) : (f64) -> f64
@@ -139,7 +139,7 @@ builtin.module {
 // CHECK-NEXT:          %47 = arith.mulf %46, %dt_1 : f32
 // CHECK-NEXT:          stencil.return %47 : f32
 // CHECK-NEXT:        }
-// CHECK-NEXT:        %f2_t1_temp_1 = stencil.store %f2_t1_temp to %f2_t1 ([0, 0] : [3, 3]) : !stencil.temp<[0,3]x[0,3]xf32> to !stencil.field<[-2,5]x[-2,5]xf32> with_halo : !stencil.temp<?x?xf32>
+// CHECK-NEXT:        stencil.store %f2_t1_temp to %f2_t1(<[0, 0], [3, 3]>)  : !stencil.temp<[0,3]x[0,3]xf32> to !stencil.field<[-2,5]x[-2,5]xf32>
 // CHECK-NEXT:        scf.yield %f2_t1, %f2_t0 : !stencil.field<[-2,5]x[-2,5]xf32>, !stencil.field<[-2,5]x[-2,5]xf32>
 // CHECK-NEXT:      }
 // CHECK-NEXT:      %5 = func.call @timer_end(%0) : (f64) -> f64
diff --git a/tests/filecheck/xdsl_mpi_pipeline.mlir b/tests/filecheck/xdsl_mpi_pipeline.mlir
index c651e5a984..d4c986a0b1 100644
--- a/tests/filecheck/xdsl_mpi_pipeline.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline.mlir
@@ -94,7 +94,7 @@ builtin.module {
         %73 = arith.mulf %7, %72 : f32
         stencil.return %73 : f32
       }
-      %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+      stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
       scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
     }
     %6 = func.call @timer_end(%0) : (f64) -> f64
@@ -202,7 +202,7 @@ builtin.module {
 // CHECK-NEXT:          %73 = arith.mulf %7, %72 : f32
 // CHECK-NEXT:          stencil.return %73 : f32
 // CHECK-NEXT:        }
-// CHECK-NEXT:        %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+// CHECK-NEXT:        stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:      }
 // CHECK-NEXT:      %6 = func.call @timer_end(%0) : (f64) -> f64
diff --git a/tests/filecheck/xdsl_mpi_pipeline_b.mlir b/tests/filecheck/xdsl_mpi_pipeline_b.mlir
index 035c313aa2..b27625d132 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_b.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_b.mlir
@@ -94,7 +94,7 @@ builtin.module {
         %73 = arith.mulf %7, %72 : f32
         stencil.return %73 : f32
       }
-      %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+      stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
       scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
     }
     %6 = func.call @timer_end(%0) : (f64) -> f64
@@ -201,7 +201,7 @@ builtin.module {
 // CHECK-NEXT:          %73 = arith.mulf %7, %72 : f32
 // CHECK-NEXT:          stencil.return %73 : f32
 // CHECK-NEXT:        }
-// CHECK-NEXT:        %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+// CHECK-NEXT:        stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:      }
 // CHECK-NEXT:      %6 = func.call @timer_end(%0) : (f64) -> f64
diff --git a/tests/filecheck/xdsl_mpi_pipeline_c.mlir b/tests/filecheck/xdsl_mpi_pipeline_c.mlir
index d4eca46f97..93ba65c468 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_c.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_c.mlir
@@ -94,7 +94,7 @@ builtin.module {
         %73 = arith.mulf %7, %72 : f32
         stencil.return %73 : f32
       }
-      %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+      stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
       scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
     }
     %6 = func.call @timer_end(%0) : (f64) -> f64
diff --git a/tests/filecheck/xdsl_mpi_pipeline_d.mlir b/tests/filecheck/xdsl_mpi_pipeline_d.mlir
index 08f3f92b2f..b02f1d4baf 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_d.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_d.mlir
@@ -94,7 +94,7 @@ builtin.module {
         %73 = arith.mulf %7, %72 : f32
         stencil.return %73 : f32
       }
-      %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+      stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
       scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
     }
     %6 = func.call @timer_end(%0) : (f64) -> f64
diff --git a/tests/filecheck/xdsl_mpi_pipeline_e.mlir b/tests/filecheck/xdsl_mpi_pipeline_e.mlir
index 82793b8062..3c230e1125 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_e.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_e.mlir
@@ -94,7 +94,7 @@ builtin.module {
         %73 = arith.mulf %7, %72 : f32
         stencil.return %73 : f32
       }
-      %u_t1_temp_1 = stencil.store %u_t1_temp to %u_t1 ([0, 0, 0] : [51, 101, 101]) : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> with_halo : !stencil.temp<?x?x?xf32>
+      stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<?x?x?xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
       scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
     }
     %6 = func.call @timer_end(%0) : (f64) -> f64
diff --git a/tests/filecheck/xdsl_pipeline.mlir b/tests/filecheck/xdsl_pipeline.mlir
index 5243a3ca7c..726f57e097 100644
--- a/tests/filecheck/xdsl_pipeline.mlir
+++ b/tests/filecheck/xdsl_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p stencil-shape-inference,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,0},printf-to-llvm,canonicalize %s | filecheck %s
+// RUN: xdsl-opt -p shape-inference,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,0},printf-to-llvm,canonicalize %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%f2_vec0 : !stencil.field<[-2,5]x[-2,5]xf32>, %f2_vec1 : !stencil.field<[-2,5]x[-2,5]xf32>, %timers : !llvm.ptr) {
@@ -64,7 +64,7 @@ builtin.module {
         %47 = arith.mulf %46, %dt_1 : f32
         stencil.return %47 : f32
       }
-      %f2_t1_temp_1 = stencil.store %f2_t1_temp to %f2_t1 ([0, 0] : [3, 3]) : !stencil.temp<?x?xf32> to !stencil.field<[-2,5]x[-2,5]xf32> with_halo : !stencil.temp<?x?xf32>
+      stencil.store %f2_t1_temp to %f2_t1(<[0, 0], [3, 3]>)  : !stencil.temp<?x?xf32> to !stencil.field<[-2,5]x[-2,5]xf32>
       scf.yield %f2_t1, %f2_t0 : !stencil.field<[-2,5]x[-2,5]xf32>, !stencil.field<[-2,5]x[-2,5]xf32>
     }
     %5 = func.call @timer_end(%0) : (f64) -> f64

From 5c15ce99459e596b7092a91ebd570181cfc6a502 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 17:21:30 +0100
Subject: [PATCH 09/25] Filecheck pipelines update.

---
 tests/filecheck/.lit_test_times.txt       | 18 +++++++++---------
 tests/filecheck/xdsl_mpi_pipeline.mlir    |  2 +-
 tests/filecheck/xdsl_mpi_pipeline_b.mlir  |  2 +-
 tests/filecheck/xdsl_mpi_pipeline_c.mlir  |  2 +-
 tests/filecheck/xdsl_mpi_pipeline_d.mlir  |  2 +-
 tests/filecheck/xdsl_mpi_pipeline_e.mlir  |  2 +-
 tests/filecheck/xdsl_pipeline.mlir        |  2 +-
 tests/filecheck/xdsl_pipeline_openmp.mlir |  2 +-
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/filecheck/.lit_test_times.txt b/tests/filecheck/.lit_test_times.txt
index 84be361be3..b1051222fe 100644
--- a/tests/filecheck/.lit_test_times.txt
+++ b/tests/filecheck/.lit_test_times.txt
@@ -1,9 +1,9 @@
-1.627717e-01 shape_inference.mlir
-9.103966e-02 version.mlir
--1.920681e-01 xdsl_mpi_pipeline.mlir
--1.922677e-01 xdsl_mpi_pipeline_b.mlir
--1.886339e-01 xdsl_mpi_pipeline_c.mlir
--2.020643e-01 xdsl_mpi_pipeline_d.mlir
--2.121959e-01 xdsl_mpi_pipeline_e.mlir
--2.791779e-01 xdsl_pipeline.mlir
--1.644087e-01 xdsl_pipeline_openmp.mlir
+1.647806e-01 shape_inference.mlir
+8.123279e-02 version.mlir
+-2.095106e-01 xdsl_mpi_pipeline.mlir
+-2.172942e-01 xdsl_mpi_pipeline_b.mlir
+-7.517524e-01 xdsl_mpi_pipeline_c.mlir
+-7.795422e-01 xdsl_mpi_pipeline_d.mlir
+-8.045800e-01 xdsl_mpi_pipeline_e.mlir
+-2.641258e-01 xdsl_pipeline.mlir
+-1.632233e-01 xdsl_pipeline_openmp.mlir
diff --git a/tests/filecheck/xdsl_mpi_pipeline.mlir b/tests/filecheck/xdsl_mpi_pipeline.mlir
index d4c986a0b1..8cb334cdf3 100644
--- a/tests/filecheck/xdsl_mpi_pipeline.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false}" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
diff --git a/tests/filecheck/xdsl_mpi_pipeline_b.mlir b/tests/filecheck/xdsl_mpi_pipeline_b.mlir
index b27625d132..684e714266 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_b.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_b.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},canonicalize-dmp" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
diff --git a/tests/filecheck/xdsl_mpi_pipeline_c.mlir b/tests/filecheck/xdsl_mpi_pipeline_c.mlir
index 93ba65c468..e94a7d2504 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_c.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_c.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},canonicalize-dmp,convert-stencil-to-ll-mlir" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
diff --git a/tests/filecheck/xdsl_mpi_pipeline_d.mlir b/tests/filecheck/xdsl_mpi_pipeline_d.mlir
index b02f1d4baf..24460a858d 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_d.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_d.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},canonicalize-dmp,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0}" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0}" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
diff --git a/tests/filecheck/xdsl_mpi_pipeline_e.mlir b/tests/filecheck/xdsl_mpi_pipeline_e.mlir
index 3c230e1125..14e82658d0 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_e.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_e.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},canonicalize-dmp,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0},dmp-to-mpi{mpi_init=false},lower-mpi" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0},dmp-to-mpi{mpi_init=false},lower-mpi" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
diff --git a/tests/filecheck/xdsl_pipeline.mlir b/tests/filecheck/xdsl_pipeline.mlir
index 726f57e097..f2981a3a17 100644
--- a/tests/filecheck/xdsl_pipeline.mlir
+++ b/tests/filecheck/xdsl_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p shape-inference,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,0},printf-to-llvm,canonicalize %s | filecheck %s
+// RUN: xdsl-opt -p canonicalize,cse,shape-inference,stencil-bufferize,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,0},printf-to-llvm,canonicalize %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%f2_vec0 : !stencil.field<[-2,5]x[-2,5]xf32>, %f2_vec1 : !stencil.field<[-2,5]x[-2,5]xf32>, %timers : !llvm.ptr) {
diff --git a/tests/filecheck/xdsl_pipeline_openmp.mlir b/tests/filecheck/xdsl_pipeline_openmp.mlir
index 85005e4770..127763e598 100644
--- a/tests/filecheck/xdsl_pipeline_openmp.mlir
+++ b/tests/filecheck/xdsl_pipeline_openmp.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "canonicalize" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @xDSLDiffusionOperator(%u_vec0 : memref<158x158x158xf32>, %u_vec1 : memref<158x158x158xf32>, %timers : !llvm.ptr) {

From 6bbe2371f7a0dc460402f3d9f1dcc267dea9f3f5 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 17:30:28 +0100
Subject: [PATCH 10/25] Filecheck updates.

---
 tests/filecheck/.lit_test_times.txt       |  18 +-
 tests/filecheck/xdsl_mpi_pipeline.mlir    | 141 ++--
 tests/filecheck/xdsl_mpi_pipeline_b.mlir  | 121 +--
 tests/filecheck/xdsl_mpi_pipeline_c.mlir  | 425 ++++++++---
 tests/filecheck/xdsl_mpi_pipeline_d.mlir  | 444 +++++++----
 tests/filecheck/xdsl_mpi_pipeline_e.mlir  | 871 +++++++++-------------
 tests/filecheck/xdsl_pipeline.mlir        | 131 ++--
 tests/filecheck/xdsl_pipeline_openmp.mlir | 272 +++----
 8 files changed, 1224 insertions(+), 1199 deletions(-)

diff --git a/tests/filecheck/.lit_test_times.txt b/tests/filecheck/.lit_test_times.txt
index b1051222fe..5b5bf8f863 100644
--- a/tests/filecheck/.lit_test_times.txt
+++ b/tests/filecheck/.lit_test_times.txt
@@ -1,9 +1,9 @@
-1.647806e-01 shape_inference.mlir
-8.123279e-02 version.mlir
--2.095106e-01 xdsl_mpi_pipeline.mlir
--2.172942e-01 xdsl_mpi_pipeline_b.mlir
--7.517524e-01 xdsl_mpi_pipeline_c.mlir
--7.795422e-01 xdsl_mpi_pipeline_d.mlir
--8.045800e-01 xdsl_mpi_pipeline_e.mlir
--2.641258e-01 xdsl_pipeline.mlir
--1.632233e-01 xdsl_pipeline_openmp.mlir
+1.568606e-01 shape_inference.mlir
+8.218265e-02 version.mlir
+2.304106e-01 xdsl_mpi_pipeline.mlir
+2.372591e-01 xdsl_mpi_pipeline_b.mlir
+7.415709e-01 xdsl_mpi_pipeline_c.mlir
+7.533531e-01 xdsl_mpi_pipeline_d.mlir
+7.714086e-01 xdsl_mpi_pipeline_e.mlir
+2.611284e-01 xdsl_pipeline.mlir
+1.698225e-01 xdsl_pipeline_openmp.mlir
diff --git a/tests/filecheck/xdsl_mpi_pipeline.mlir b/tests/filecheck/xdsl_mpi_pipeline.mlir
index 8cb334cdf3..8d62ae8d60 100644
--- a/tests/filecheck/xdsl_mpi_pipeline.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
@@ -111,104 +111,63 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 1 : index
 // CHECK-NEXT:      %time_M = arith.constant 20 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
-// CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3, %4, %5 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (!stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>) {
+// CHECK-NEXT:      %1 = arith.addi %time_M, %time_m : index
+// CHECK-NEXT:      %2, %3, %4 = scf.for %time = %time_m to %1 step %time_m iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (!stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>) {
 // CHECK-NEXT:        %u_t0_temp = stencil.load %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> -> !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:        "dmp.swap"(%u_t0_temp) {"topo" = #dmp.topo<2x1x1>, "swaps" = [#dmp.exchange<at [51, 0, 0] size [1, 101, 101] source offset [-1, 0, 0] to [1, 0, 0]>, #dmp.exchange<at [-1, 0, 0] size [1, 101, 101] source offset [1, 0, 0] to [-1, 0, 0]>, #dmp.exchange<at [0, 101, 0] size [51, 1, 101] source offset [0, -1, 0] to [0, 1, 0]>, #dmp.exchange<at [0, -1, 0] size [51, 1, 101] source offset [0, 1, 0] to [0, -1, 0]>, #dmp.exchange<at [0, 0, 101] size [51, 101, 1] source offset [0, 0, -1] to [0, 0, 1]>, #dmp.exchange<at [0, 0, -1] size [51, 101, 1] source offset [0, 0, 1] to [0, 0, -1]>]} : (!stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>) -> ()
+// CHECK-NEXT:        %5 = "dmp.swap"(%u_t0_temp) {"strategy" = #dmp.grid_slice_3d<#dmp.topo<2x1x1>, false>, "swaps" = [#dmp.exchange<at [51, 0, 0] size [1, 101, 101] source offset [-1, 0, 0] to [1, 0, 0]>, #dmp.exchange<at [-1, 0, 0] size [1, 101, 101] source offset [1, 0, 0] to [-1, 0, 0]>, #dmp.exchange<at [0, 101, 0] size [51, 1, 101] source offset [0, -1, 0] to [0, 1, 0]>, #dmp.exchange<at [0, -1, 0] size [51, 1, 101] source offset [0, 1, 0] to [0, -1, 0]>, #dmp.exchange<at [0, 0, 101] size [51, 101, 1] source offset [0, 0, -1] to [0, 0, 1]>, #dmp.exchange<at [0, 0, -1] size [51, 101, 1] source offset [0, 0, 1] to [0, 0, -1]>]} : (!stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>) -> !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
 // CHECK-NEXT:        %u_t2_temp = stencil.load %u_t2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> -> !stencil.temp<[0,51]x[0,101]x[0,101]xf32>
-// CHECK-NEXT:        "dmp.swap"(%u_t2_temp) {"topo" = #dmp.topo<2x1x1>, "swaps" = []} : (!stencil.temp<[0,51]x[0,101]x[0,101]xf32>) -> ()
-// CHECK-NEXT:        %u_t1_temp = stencil.apply(%u_t0_blk = %u_t0_temp : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>, %u_t2_blk = %u_t2_temp : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>) -> (!stencil.temp<[0,51]x[0,101]x[0,101]xf32>) {
+// CHECK-NEXT:        %6 = "dmp.swap"(%u_t2_temp) {"strategy" = #dmp.grid_slice_3d<#dmp.topo<2x1x1>, false>, "swaps" = []} : (!stencil.temp<[0,51]x[0,101]x[0,101]xf32>) -> !stencil.temp<[0,51]x[0,101]x[0,101]xf32>
+// CHECK-NEXT:        %u_t1_temp = stencil.apply(%u_t0_blk = %5 : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>, %u_t2_blk = %6 : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>) -> (!stencil.temp<[0,51]x[0,101]x[0,101]xf32>) {
 // CHECK-NEXT:          %dt = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %6 = arith.constant 2 : i64
-// CHECK-NEXT:          %7 = "math.fpowi"(%dt, %6) : (f32, i64) -> f32
-// CHECK-NEXT:          %8 = arith.constant -1 : i64
-// CHECK-NEXT:          %dt_1 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %9 = arith.constant -2 : i64
-// CHECK-NEXT:          %10 = "math.fpowi"(%dt_1, %9) : (f32, i64) -> f32
-// CHECK-NEXT:          %11 = stencil.access %u_t2_blk[0, 0, 0] : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>
-// CHECK-NEXT:          %12 = arith.mulf %10, %11 : f32
-// CHECK-NEXT:          %13 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %dt_2 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %14 = arith.constant -2 : i64
-// CHECK-NEXT:          %15 = "math.fpowi"(%dt_2, %14) : (f32, i64) -> f32
-// CHECK-NEXT:          %16 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %17 = arith.mulf %13, %15 : f32
-// CHECK-NEXT:          %18 = arith.mulf %17, %16 : f32
-// CHECK-NEXT:          %19 = arith.addf %12, %18 : f32
-// CHECK-NEXT:          %20 = arith.sitofp %8 : i64 to f32
-// CHECK-NEXT:          %21 = arith.mulf %20, %19 : f32
+// CHECK-NEXT:          %7 = arith.constant 2 : i64
+// CHECK-NEXT:          %8 = "math.fpowi"(%dt, %7) : (f32, i64) -> f32
+// CHECK-NEXT:          %9 = arith.constant -1 : i64
+// CHECK-NEXT:          %10 = arith.constant -2 : i64
+// CHECK-NEXT:          %11 = "math.fpowi"(%dt, %10) : (f32, i64) -> f32
+// CHECK-NEXT:          %12 = stencil.access %u_t2_blk[0, 0, 0] : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>
+// CHECK-NEXT:          %13 = arith.mulf %11, %12 : f32
+// CHECK-NEXT:          %14 = arith.constant -2.000000e+00 : f32
+// CHECK-NEXT:          %15 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %16 = arith.mulf %14, %11 : f32
+// CHECK-NEXT:          %17 = arith.mulf %16, %15 : f32
+// CHECK-NEXT:          %18 = arith.addf %13, %17 : f32
+// CHECK-NEXT:          %19 = arith.sitofp %9 : i64 to f32
+// CHECK-NEXT:          %20 = arith.mulf %19, %18 : f32
 // CHECK-NEXT:          %h_x = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %22 = arith.constant -2 : i64
-// CHECK-NEXT:          %23 = "math.fpowi"(%h_x, %22) : (f32, i64) -> f32
-// CHECK-NEXT:          %24 = stencil.access %u_t0_blk[-1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %25 = arith.mulf %23, %24 : f32
-// CHECK-NEXT:          %h_x_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %26 = arith.constant -2 : i64
-// CHECK-NEXT:          %27 = "math.fpowi"(%h_x_1, %26) : (f32, i64) -> f32
-// CHECK-NEXT:          %28 = stencil.access %u_t0_blk[1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %29 = arith.mulf %27, %28 : f32
-// CHECK-NEXT:          %30 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_x_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %31 = arith.constant -2 : i64
-// CHECK-NEXT:          %32 = "math.fpowi"(%h_x_2, %31) : (f32, i64) -> f32
-// CHECK-NEXT:          %33 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %34 = arith.mulf %30, %32 : f32
-// CHECK-NEXT:          %35 = arith.mulf %34, %33 : f32
-// CHECK-NEXT:          %36 = arith.addf %25, %29 : f32
-// CHECK-NEXT:          %37 = arith.addf %36, %35 : f32
-// CHECK-NEXT:          %h_y = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %38 = arith.constant -2 : i64
-// CHECK-NEXT:          %39 = "math.fpowi"(%h_y, %38) : (f32, i64) -> f32
-// CHECK-NEXT:          %40 = stencil.access %u_t0_blk[0, -1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %41 = arith.mulf %39, %40 : f32
-// CHECK-NEXT:          %h_y_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %42 = arith.constant -2 : i64
-// CHECK-NEXT:          %43 = "math.fpowi"(%h_y_1, %42) : (f32, i64) -> f32
-// CHECK-NEXT:          %44 = stencil.access %u_t0_blk[0, 1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %45 = arith.mulf %43, %44 : f32
-// CHECK-NEXT:          %46 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_y_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %47 = arith.constant -2 : i64
-// CHECK-NEXT:          %48 = "math.fpowi"(%h_y_2, %47) : (f32, i64) -> f32
-// CHECK-NEXT:          %49 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %50 = arith.mulf %46, %48 : f32
-// CHECK-NEXT:          %51 = arith.mulf %50, %49 : f32
-// CHECK-NEXT:          %52 = arith.addf %41, %45 : f32
-// CHECK-NEXT:          %53 = arith.addf %52, %51 : f32
-// CHECK-NEXT:          %h_z = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %54 = arith.constant -2 : i64
-// CHECK-NEXT:          %55 = "math.fpowi"(%h_z, %54) : (f32, i64) -> f32
-// CHECK-NEXT:          %56 = stencil.access %u_t0_blk[0, 0, -1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %57 = arith.mulf %55, %56 : f32
-// CHECK-NEXT:          %h_z_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %58 = arith.constant -2 : i64
-// CHECK-NEXT:          %59 = "math.fpowi"(%h_z_1, %58) : (f32, i64) -> f32
-// CHECK-NEXT:          %60 = stencil.access %u_t0_blk[0, 0, 1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %61 = arith.mulf %59, %60 : f32
-// CHECK-NEXT:          %62 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_z_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %63 = arith.constant -2 : i64
-// CHECK-NEXT:          %64 = "math.fpowi"(%h_z_2, %63) : (f32, i64) -> f32
-// CHECK-NEXT:          %65 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %66 = arith.mulf %62, %64 : f32
-// CHECK-NEXT:          %67 = arith.mulf %66, %65 : f32
-// CHECK-NEXT:          %68 = arith.addf %57, %61 : f32
-// CHECK-NEXT:          %69 = arith.addf %68, %67 : f32
-// CHECK-NEXT:          %70 = arith.addf %21, %37 : f32
-// CHECK-NEXT:          %71 = arith.addf %70, %53 : f32
-// CHECK-NEXT:          %72 = arith.addf %71, %69 : f32
-// CHECK-NEXT:          %73 = arith.mulf %7, %72 : f32
-// CHECK-NEXT:          stencil.return %73 : f32
+// CHECK-NEXT:          %21 = "math.fpowi"(%h_x, %10) : (f32, i64) -> f32
+// CHECK-NEXT:          %22 = stencil.access %u_t0_blk[-1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %23 = arith.mulf %21, %22 : f32
+// CHECK-NEXT:          %24 = stencil.access %u_t0_blk[1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %25 = arith.mulf %21, %24 : f32
+// CHECK-NEXT:          %26 = arith.mulf %14, %21 : f32
+// CHECK-NEXT:          %27 = arith.mulf %26, %15 : f32
+// CHECK-NEXT:          %28 = arith.addf %23, %25 : f32
+// CHECK-NEXT:          %29 = arith.addf %28, %27 : f32
+// CHECK-NEXT:          %30 = stencil.access %u_t0_blk[0, -1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %31 = arith.mulf %21, %30 : f32
+// CHECK-NEXT:          %32 = stencil.access %u_t0_blk[0, 1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %33 = arith.mulf %21, %32 : f32
+// CHECK-NEXT:          %34 = arith.addf %31, %33 : f32
+// CHECK-NEXT:          %35 = arith.addf %34, %27 : f32
+// CHECK-NEXT:          %36 = stencil.access %u_t0_blk[0, 0, -1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %37 = arith.mulf %21, %36 : f32
+// CHECK-NEXT:          %38 = stencil.access %u_t0_blk[0, 0, 1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %39 = arith.mulf %21, %38 : f32
+// CHECK-NEXT:          %40 = arith.addf %37, %39 : f32
+// CHECK-NEXT:          %41 = arith.addf %40, %27 : f32
+// CHECK-NEXT:          %42 = arith.addf %20, %29 : f32
+// CHECK-NEXT:          %43 = arith.addf %42, %35 : f32
+// CHECK-NEXT:          %44 = arith.addf %43, %41 : f32
+// CHECK-NEXT:          %45 = arith.mulf %8, %44 : f32
+// CHECK-NEXT:          stencil.return %45 : f32
 // CHECK-NEXT:        }
-// CHECK-NEXT:        stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
+// CHECK-NEXT:        stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>) : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:      }
-// CHECK-NEXT:      %6 = func.call @timer_end(%0) : (f64) -> f64
-// CHECK-NEXT:      "llvm.store"(%6, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
+// CHECK-NEXT:      %7 = func.call @timer_end(%0) : (f64) -> f64
+// CHECK-NEXT:      "llvm.store"(%7, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
 // CHECK-NEXT:      func.return
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64
 // CHECK-NEXT:    func.func private @timer_end(f64) -> f64
-// CHECK-NEXT:  }
\ No newline at end of file
+// CHECK-NEXT:  }
diff --git a/tests/filecheck/xdsl_mpi_pipeline_b.mlir b/tests/filecheck/xdsl_mpi_pipeline_b.mlir
index 684e714266..aa4351c7c9 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_b.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_b.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
@@ -111,97 +111,56 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 1 : index
 // CHECK-NEXT:      %time_M = arith.constant 20 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
-// CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3, %4, %5 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (!stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>) {
+// CHECK-NEXT:      %1 = arith.addi %time_M, %time_m : index
+// CHECK-NEXT:      %2, %3, %4 = scf.for %time = %time_m to %1 step %time_m iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (!stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>) {
 // CHECK-NEXT:        %u_t0_temp = stencil.load %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> -> !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:        "dmp.swap"(%u_t0_temp) {"topo" = #dmp.topo<2x1x1>, "swaps" = [#dmp.exchange<at [51, 0, 0] size [1, 101, 101] source offset [-1, 0, 0] to [1, 0, 0]>, #dmp.exchange<at [-1, 0, 0] size [1, 101, 101] source offset [1, 0, 0] to [-1, 0, 0]>, #dmp.exchange<at [0, 101, 0] size [51, 1, 101] source offset [0, -1, 0] to [0, 1, 0]>, #dmp.exchange<at [0, -1, 0] size [51, 1, 101] source offset [0, 1, 0] to [0, -1, 0]>, #dmp.exchange<at [0, 0, 101] size [51, 101, 1] source offset [0, 0, -1] to [0, 0, 1]>, #dmp.exchange<at [0, 0, -1] size [51, 101, 1] source offset [0, 0, 1] to [0, 0, -1]>]} : (!stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>) -> ()
+// CHECK-NEXT:        %5 = "dmp.swap"(%u_t0_temp) {"strategy" = #dmp.grid_slice_3d<#dmp.topo<2x1x1>, false>, "swaps" = [#dmp.exchange<at [51, 0, 0] size [1, 101, 101] source offset [-1, 0, 0] to [1, 0, 0]>, #dmp.exchange<at [-1, 0, 0] size [1, 101, 101] source offset [1, 0, 0] to [-1, 0, 0]>, #dmp.exchange<at [0, 101, 0] size [51, 1, 101] source offset [0, -1, 0] to [0, 1, 0]>, #dmp.exchange<at [0, -1, 0] size [51, 1, 101] source offset [0, 1, 0] to [0, -1, 0]>, #dmp.exchange<at [0, 0, 101] size [51, 101, 1] source offset [0, 0, -1] to [0, 0, 1]>, #dmp.exchange<at [0, 0, -1] size [51, 101, 1] source offset [0, 0, 1] to [0, 0, -1]>]} : (!stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>) -> !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
 // CHECK-NEXT:        %u_t2_temp = stencil.load %u_t2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32> -> !stencil.temp<[0,51]x[0,101]x[0,101]xf32>
-// CHECK-NEXT:        %u_t1_temp = stencil.apply(%u_t0_blk = %u_t0_temp : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>, %u_t2_blk = %u_t2_temp : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>) -> (!stencil.temp<[0,51]x[0,101]x[0,101]xf32>) {
+// CHECK-NEXT:        %u_t1_temp = stencil.apply(%u_t0_blk = %5 : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>, %u_t2_blk = %u_t2_temp : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>) -> (!stencil.temp<[0,51]x[0,101]x[0,101]xf32>) {
 // CHECK-NEXT:          %dt = arith.constant 1.000000e-04 : f32
 // CHECK-NEXT:          %6 = arith.constant 2 : i64
 // CHECK-NEXT:          %7 = "math.fpowi"(%dt, %6) : (f32, i64) -> f32
 // CHECK-NEXT:          %8 = arith.constant -1 : i64
-// CHECK-NEXT:          %dt_1 = arith.constant 1.000000e-04 : f32
 // CHECK-NEXT:          %9 = arith.constant -2 : i64
-// CHECK-NEXT:          %10 = "math.fpowi"(%dt_1, %9) : (f32, i64) -> f32
+// CHECK-NEXT:          %10 = "math.fpowi"(%dt, %9) : (f32, i64) -> f32
 // CHECK-NEXT:          %11 = stencil.access %u_t2_blk[0, 0, 0] : !stencil.temp<[0,51]x[0,101]x[0,101]xf32>
 // CHECK-NEXT:          %12 = arith.mulf %10, %11 : f32
 // CHECK-NEXT:          %13 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %dt_2 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %14 = arith.constant -2 : i64
-// CHECK-NEXT:          %15 = "math.fpowi"(%dt_2, %14) : (f32, i64) -> f32
-// CHECK-NEXT:          %16 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %17 = arith.mulf %13, %15 : f32
-// CHECK-NEXT:          %18 = arith.mulf %17, %16 : f32
-// CHECK-NEXT:          %19 = arith.addf %12, %18 : f32
-// CHECK-NEXT:          %20 = arith.sitofp %8 : i64 to f32
-// CHECK-NEXT:          %21 = arith.mulf %20, %19 : f32
+// CHECK-NEXT:          %14 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %15 = arith.mulf %13, %10 : f32
+// CHECK-NEXT:          %16 = arith.mulf %15, %14 : f32
+// CHECK-NEXT:          %17 = arith.addf %12, %16 : f32
+// CHECK-NEXT:          %18 = arith.sitofp %8 : i64 to f32
+// CHECK-NEXT:          %19 = arith.mulf %18, %17 : f32
 // CHECK-NEXT:          %h_x = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %22 = arith.constant -2 : i64
-// CHECK-NEXT:          %23 = "math.fpowi"(%h_x, %22) : (f32, i64) -> f32
-// CHECK-NEXT:          %24 = stencil.access %u_t0_blk[-1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %25 = arith.mulf %23, %24 : f32
-// CHECK-NEXT:          %h_x_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %26 = arith.constant -2 : i64
-// CHECK-NEXT:          %27 = "math.fpowi"(%h_x_1, %26) : (f32, i64) -> f32
-// CHECK-NEXT:          %28 = stencil.access %u_t0_blk[1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %29 = arith.mulf %27, %28 : f32
-// CHECK-NEXT:          %30 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_x_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %31 = arith.constant -2 : i64
-// CHECK-NEXT:          %32 = "math.fpowi"(%h_x_2, %31) : (f32, i64) -> f32
-// CHECK-NEXT:          %33 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %34 = arith.mulf %30, %32 : f32
-// CHECK-NEXT:          %35 = arith.mulf %34, %33 : f32
-// CHECK-NEXT:          %36 = arith.addf %25, %29 : f32
-// CHECK-NEXT:          %37 = arith.addf %36, %35 : f32
-// CHECK-NEXT:          %h_y = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %38 = arith.constant -2 : i64
-// CHECK-NEXT:          %39 = "math.fpowi"(%h_y, %38) : (f32, i64) -> f32
-// CHECK-NEXT:          %40 = stencil.access %u_t0_blk[0, -1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %41 = arith.mulf %39, %40 : f32
-// CHECK-NEXT:          %h_y_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %42 = arith.constant -2 : i64
-// CHECK-NEXT:          %43 = "math.fpowi"(%h_y_1, %42) : (f32, i64) -> f32
-// CHECK-NEXT:          %44 = stencil.access %u_t0_blk[0, 1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %45 = arith.mulf %43, %44 : f32
-// CHECK-NEXT:          %46 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_y_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %47 = arith.constant -2 : i64
-// CHECK-NEXT:          %48 = "math.fpowi"(%h_y_2, %47) : (f32, i64) -> f32
-// CHECK-NEXT:          %49 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %50 = arith.mulf %46, %48 : f32
-// CHECK-NEXT:          %51 = arith.mulf %50, %49 : f32
-// CHECK-NEXT:          %52 = arith.addf %41, %45 : f32
-// CHECK-NEXT:          %53 = arith.addf %52, %51 : f32
-// CHECK-NEXT:          %h_z = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %54 = arith.constant -2 : i64
-// CHECK-NEXT:          %55 = "math.fpowi"(%h_z, %54) : (f32, i64) -> f32
-// CHECK-NEXT:          %56 = stencil.access %u_t0_blk[0, 0, -1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %57 = arith.mulf %55, %56 : f32
-// CHECK-NEXT:          %h_z_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %58 = arith.constant -2 : i64
-// CHECK-NEXT:          %59 = "math.fpowi"(%h_z_1, %58) : (f32, i64) -> f32
-// CHECK-NEXT:          %60 = stencil.access %u_t0_blk[0, 0, 1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %61 = arith.mulf %59, %60 : f32
-// CHECK-NEXT:          %62 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_z_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %63 = arith.constant -2 : i64
-// CHECK-NEXT:          %64 = "math.fpowi"(%h_z_2, %63) : (f32, i64) -> f32
-// CHECK-NEXT:          %65 = stencil.access %u_t0_blk[0, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
-// CHECK-NEXT:          %66 = arith.mulf %62, %64 : f32
-// CHECK-NEXT:          %67 = arith.mulf %66, %65 : f32
-// CHECK-NEXT:          %68 = arith.addf %57, %61 : f32
-// CHECK-NEXT:          %69 = arith.addf %68, %67 : f32
-// CHECK-NEXT:          %70 = arith.addf %21, %37 : f32
-// CHECK-NEXT:          %71 = arith.addf %70, %53 : f32
-// CHECK-NEXT:          %72 = arith.addf %71, %69 : f32
-// CHECK-NEXT:          %73 = arith.mulf %7, %72 : f32
-// CHECK-NEXT:          stencil.return %73 : f32
+// CHECK-NEXT:          %20 = "math.fpowi"(%h_x, %9) : (f32, i64) -> f32
+// CHECK-NEXT:          %21 = stencil.access %u_t0_blk[-1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %22 = arith.mulf %20, %21 : f32
+// CHECK-NEXT:          %23 = stencil.access %u_t0_blk[1, 0, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %24 = arith.mulf %20, %23 : f32
+// CHECK-NEXT:          %25 = arith.mulf %13, %20 : f32
+// CHECK-NEXT:          %26 = arith.mulf %25, %14 : f32
+// CHECK-NEXT:          %27 = arith.addf %22, %24 : f32
+// CHECK-NEXT:          %28 = arith.addf %27, %26 : f32
+// CHECK-NEXT:          %29 = stencil.access %u_t0_blk[0, -1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %30 = arith.mulf %20, %29 : f32
+// CHECK-NEXT:          %31 = stencil.access %u_t0_blk[0, 1, 0] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %32 = arith.mulf %20, %31 : f32
+// CHECK-NEXT:          %33 = arith.addf %30, %32 : f32
+// CHECK-NEXT:          %34 = arith.addf %33, %26 : f32
+// CHECK-NEXT:          %35 = stencil.access %u_t0_blk[0, 0, -1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %36 = arith.mulf %20, %35 : f32
+// CHECK-NEXT:          %37 = stencil.access %u_t0_blk[0, 0, 1] : !stencil.temp<[-1,52]x[-1,102]x[-1,102]xf32>
+// CHECK-NEXT:          %38 = arith.mulf %20, %37 : f32
+// CHECK-NEXT:          %39 = arith.addf %36, %38 : f32
+// CHECK-NEXT:          %40 = arith.addf %39, %26 : f32
+// CHECK-NEXT:          %41 = arith.addf %19, %28 : f32
+// CHECK-NEXT:          %42 = arith.addf %41, %34 : f32
+// CHECK-NEXT:          %43 = arith.addf %42, %40 : f32
+// CHECK-NEXT:          %44 = arith.mulf %7, %43 : f32
+// CHECK-NEXT:          stencil.return %44 : f32
 // CHECK-NEXT:        }
-// CHECK-NEXT:        stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>)  : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
+// CHECK-NEXT:        stencil.store %u_t1_temp to %u_t1(<[0, 0, 0], [51, 101, 101]>) : !stencil.temp<[0,51]x[0,101]x[0,101]xf32> to !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>
 // CHECK-NEXT:      }
 // CHECK-NEXT:      %6 = func.call @timer_end(%0) : (f64) -> f64
@@ -210,4 +169,4 @@ builtin.module {
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64
 // CHECK-NEXT:    func.func private @timer_end(f64) -> f64
-// CHECK-NEXT:  }
\ No newline at end of file
+// CHECK-NEXT:  }
diff --git a/tests/filecheck/xdsl_mpi_pipeline_c.mlir b/tests/filecheck/xdsl_mpi_pipeline_c.mlir
index e94a7d2504..0ab85d4468 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_c.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_c.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
@@ -111,127 +111,322 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 1 : index
 // CHECK-NEXT:      %time_M = arith.constant 20 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
-// CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3, %4, %5 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>) {
-// CHECK-NEXT:        %u_t1_storeview = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %u_t0_loadview = "memref.subview"(%u_t0) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 53, 103, 103>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        "dmp.swap"(%u_t0_loadview) {"topo" = #dmp.topo<2x1x1>, "swaps" = [#dmp.exchange<at [51, 0, 0] size [1, 101, 101] source offset [-1, 0, 0] to [1, 0, 0]>, #dmp.exchange<at [-1, 0, 0] size [1, 101, 101] source offset [1, 0, 0] to [-1, 0, 0]>, #dmp.exchange<at [0, 101, 0] size [51, 1, 101] source offset [0, -1, 0] to [0, 1, 0]>, #dmp.exchange<at [0, -1, 0] size [51, 1, 101] source offset [0, 1, 0] to [0, -1, 0]>, #dmp.exchange<at [0, 0, 101] size [51, 101, 1] source offset [0, 0, -1] to [0, 0, 1]>, #dmp.exchange<at [0, 0, -1] size [51, 101, 1] source offset [0, 0, 1] to [0, 0, -1]>]} : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> ()
-// CHECK-NEXT:        %u_t2_loadview = "memref.subview"(%u_t2) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %6 = arith.constant 0 : index
-// CHECK-NEXT:        %7 = arith.constant 0 : index
-// CHECK-NEXT:        %8 = arith.constant 0 : index
-// CHECK-NEXT:        %9 = arith.constant 1 : index
-// CHECK-NEXT:        %10 = arith.constant 1 : index
-// CHECK-NEXT:        %11 = arith.constant 1 : index
-// CHECK-NEXT:        %12 = arith.constant 51 : index
-// CHECK-NEXT:        %13 = arith.constant 101 : index
-// CHECK-NEXT:        %14 = arith.constant 101 : index
-// CHECK-NEXT:        "scf.parallel"(%6, %7, %8, %12, %13, %14, %9, %10, %11) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
-// CHECK-NEXT:        ^0(%15 : index, %16 : index, %17 : index):
+// CHECK-NEXT:      %1 = arith.addi %time_M, %time_m : index
+// CHECK-NEXT:      %2 = arith.constant 12 : i32
+// CHECK-NEXT:      %3 = "mpi.allocate"(%2) {"dtype" = !mpi.request} : (i32) -> !mpi.vector<!mpi.request>
+// CHECK-NEXT:      %4 = "mpi.comm.rank"() : () -> i32
+// CHECK-NEXT:      %send_buff_ex0 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %send_buff_ex0_ptr, %5, %6 = "mpi.unwrap_memref"(%send_buff_ex0) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex0 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %recv_buff_ex0_ptr, %7, %8 = "mpi.unwrap_memref"(%recv_buff_ex0) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex1 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %send_buff_ex1_ptr, %9, %10 = "mpi.unwrap_memref"(%send_buff_ex1) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex1 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %recv_buff_ex1_ptr, %11, %12 = "mpi.unwrap_memref"(%recv_buff_ex1) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex2 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex2_ptr, %13, %14 = "mpi.unwrap_memref"(%send_buff_ex2) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex2 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex2_ptr, %15, %16 = "mpi.unwrap_memref"(%recv_buff_ex2) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex3 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex3_ptr, %17, %18 = "mpi.unwrap_memref"(%send_buff_ex3) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex3 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex3_ptr, %19, %20 = "mpi.unwrap_memref"(%recv_buff_ex3) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex4 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex4_ptr, %21, %22 = "mpi.unwrap_memref"(%send_buff_ex4) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex4 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex4_ptr, %23, %24 = "mpi.unwrap_memref"(%recv_buff_ex4) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex5 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex5_ptr, %25, %26 = "mpi.unwrap_memref"(%send_buff_ex5) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex5 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex5_ptr, %27, %28 = "mpi.unwrap_memref"(%recv_buff_ex5) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %29, %30, %31 = scf.for %time = %time_m to %1 step %time_m iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>) {
+// CHECK-NEXT:        %32 = arith.constant 0 : i32
+// CHECK-NEXT:        %33 = arith.constant 1 : i32
+// CHECK-NEXT:        %34 = arith.divui %4, %33 : i32
+// CHECK-NEXT:        %35 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %36 = arith.divui %35, %33 : i32
+// CHECK-NEXT:        %37 = arith.remui %35, %33 : i32
+// CHECK-NEXT:        %38 = arith.divui %37, %33 : i32
+// CHECK-NEXT:        %39 = arith.remui %37, %33 : i32
+// CHECK-NEXT:        %40 = arith.addi %34, %33 : i32
+// CHECK-NEXT:        %41 = arith.constant 2 : i32
+// CHECK-NEXT:        %42 = arith.cmpi slt, %40, %41 : i32
+// CHECK-NEXT:        %43 = arith.constant true
+// CHECK-NEXT:        %44 = arith.andi %42, %43 : i1
+// CHECK-NEXT:        %45 = arith.andi %44, %43 : i1
+// CHECK-NEXT:        %46 = arith.muli %33, %40 : i32
+// CHECK-NEXT:        %47 = arith.addi %38, %46 : i32
+// CHECK-NEXT:        %48 = arith.muli %33, %36 : i32
+// CHECK-NEXT:        %49 = arith.addi %47, %48 : i32
+// CHECK-NEXT:        %50 = arith.constant 6 : i32
+// CHECK-NEXT:        %51 = "mpi.vector_get"(%3, %32) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %52 = "mpi.vector_get"(%3, %50) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%45) ({
+// CHECK-NEXT:          %53 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %54 = memref.subview %53[52, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 573512>>
+// CHECK-NEXT:          "memref.copy"(%54, %send_buff_ex0) : (memref<101x101xf32, strided<[105, 1], offset: 573512>>, memref<101x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex0_ptr, %5, %6, %49, %32, %51) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex0_ptr, %7, %8, %49, %32, %52) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%51) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%52) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %55 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %56 = arith.divui %55, %33 : i32
+// CHECK-NEXT:        %57 = arith.remui %55, %33 : i32
+// CHECK-NEXT:        %58 = arith.divui %57, %33 : i32
+// CHECK-NEXT:        %59 = arith.remui %57, %33 : i32
+// CHECK-NEXT:        %60 = arith.constant -1 : i32
+// CHECK-NEXT:        %61 = arith.addi %34, %60 : i32
+// CHECK-NEXT:        %62 = arith.cmpi sge, %61, %32 : i32
+// CHECK-NEXT:        %63 = arith.andi %62, %43 : i1
+// CHECK-NEXT:        %64 = arith.andi %63, %43 : i1
+// CHECK-NEXT:        %65 = arith.muli %33, %61 : i32
+// CHECK-NEXT:        %66 = arith.addi %58, %65 : i32
+// CHECK-NEXT:        %67 = arith.muli %33, %56 : i32
+// CHECK-NEXT:        %68 = arith.addi %66, %67 : i32
+// CHECK-NEXT:        %69 = arith.constant 7 : i32
+// CHECK-NEXT:        %70 = "mpi.vector_get"(%3, %33) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %71 = "mpi.vector_get"(%3, %69) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%64) ({
+// CHECK-NEXT:          %72 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %73 = memref.subview %72[2, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%73, %send_buff_ex1) : (memref<101x101xf32, strided<[105, 1], offset: 22262>>, memref<101x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex1_ptr, %9, %10, %68, %32, %70) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex1_ptr, %11, %12, %68, %32, %71) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%70) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%71) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %74 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %75 = arith.divui %74, %33 : i32
+// CHECK-NEXT:        %76 = arith.remui %74, %33 : i32
+// CHECK-NEXT:        %77 = arith.divui %76, %33 : i32
+// CHECK-NEXT:        %78 = arith.remui %76, %33 : i32
+// CHECK-NEXT:        %79 = arith.addi %75, %33 : i32
+// CHECK-NEXT:        %80 = arith.cmpi slt, %79, %33 : i32
+// CHECK-NEXT:        %81 = arith.andi %43, %80 : i1
+// CHECK-NEXT:        %82 = arith.andi %81, %43 : i1
+// CHECK-NEXT:        %83 = arith.muli %33, %34 : i32
+// CHECK-NEXT:        %84 = arith.addi %77, %83 : i32
+// CHECK-NEXT:        %85 = arith.muli %33, %79 : i32
+// CHECK-NEXT:        %86 = arith.addi %84, %85 : i32
+// CHECK-NEXT:        %87 = arith.constant 8 : i32
+// CHECK-NEXT:        %88 = "mpi.vector_get"(%3, %41) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %89 = "mpi.vector_get"(%3, %87) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%82) ({
+// CHECK-NEXT:          %90 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %91 = memref.subview %90[2, 102, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 32762>>
+// CHECK-NEXT:          "memref.copy"(%91, %send_buff_ex2) : (memref<51x101xf32, strided<[11025, 1], offset: 32762>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex2_ptr, %13, %14, %86, %32, %88) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex2_ptr, %15, %16, %86, %32, %89) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%88) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%89) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %92 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %93 = arith.divui %92, %33 : i32
+// CHECK-NEXT:        %94 = arith.remui %92, %33 : i32
+// CHECK-NEXT:        %95 = arith.divui %94, %33 : i32
+// CHECK-NEXT:        %96 = arith.remui %94, %33 : i32
+// CHECK-NEXT:        %97 = arith.addi %93, %60 : i32
+// CHECK-NEXT:        %98 = arith.cmpi sge, %97, %32 : i32
+// CHECK-NEXT:        %99 = arith.andi %43, %98 : i1
+// CHECK-NEXT:        %100 = arith.andi %99, %43 : i1
+// CHECK-NEXT:        %101 = arith.addi %95, %83 : i32
+// CHECK-NEXT:        %102 = arith.muli %33, %97 : i32
+// CHECK-NEXT:        %103 = arith.addi %101, %102 : i32
+// CHECK-NEXT:        %104 = arith.constant 3 : i32
+// CHECK-NEXT:        %105 = arith.constant 9 : i32
+// CHECK-NEXT:        %106 = "mpi.vector_get"(%3, %104) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %107 = "mpi.vector_get"(%3, %105) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%100) ({
+// CHECK-NEXT:          %108 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %109 = memref.subview %108[2, 2, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%109, %send_buff_ex3) : (memref<51x101xf32, strided<[11025, 1], offset: 22262>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex3_ptr, %17, %18, %103, %32, %106) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex3_ptr, %19, %20, %103, %32, %107) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%106) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%107) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %110 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %111 = arith.divui %110, %33 : i32
+// CHECK-NEXT:        %112 = arith.remui %110, %33 : i32
+// CHECK-NEXT:        %113 = arith.divui %112, %33 : i32
+// CHECK-NEXT:        %114 = arith.remui %112, %33 : i32
+// CHECK-NEXT:        %115 = arith.addi %113, %33 : i32
+// CHECK-NEXT:        %116 = arith.cmpi slt, %115, %33 : i32
+// CHECK-NEXT:        %117 = arith.andi %43, %43 : i1
+// CHECK-NEXT:        %118 = arith.andi %117, %116 : i1
+// CHECK-NEXT:        %119 = arith.addi %115, %83 : i32
+// CHECK-NEXT:        %120 = arith.muli %33, %111 : i32
+// CHECK-NEXT:        %121 = arith.addi %119, %120 : i32
+// CHECK-NEXT:        %122 = arith.constant 4 : i32
+// CHECK-NEXT:        %123 = arith.constant 10 : i32
+// CHECK-NEXT:        %124 = "mpi.vector_get"(%3, %122) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %125 = "mpi.vector_get"(%3, %123) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%118) ({
+// CHECK-NEXT:          %126 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %127 = memref.subview %126[2, 2, 102] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22362>>
+// CHECK-NEXT:          "memref.copy"(%127, %send_buff_ex4) : (memref<51x101xf32, strided<[11025, 105], offset: 22362>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex4_ptr, %21, %22, %121, %32, %124) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex4_ptr, %23, %24, %121, %32, %125) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%124) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%125) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %128 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %129 = arith.divui %128, %33 : i32
+// CHECK-NEXT:        %130 = arith.remui %128, %33 : i32
+// CHECK-NEXT:        %131 = arith.divui %130, %33 : i32
+// CHECK-NEXT:        %132 = arith.remui %130, %33 : i32
+// CHECK-NEXT:        %133 = arith.addi %131, %60 : i32
+// CHECK-NEXT:        %134 = arith.cmpi sge, %133, %32 : i32
+// CHECK-NEXT:        %135 = arith.andi %117, %134 : i1
+// CHECK-NEXT:        %136 = arith.addi %133, %83 : i32
+// CHECK-NEXT:        %137 = arith.muli %33, %129 : i32
+// CHECK-NEXT:        %138 = arith.addi %136, %137 : i32
+// CHECK-NEXT:        %139 = arith.constant 5 : i32
+// CHECK-NEXT:        %140 = arith.constant 11 : i32
+// CHECK-NEXT:        %141 = "mpi.vector_get"(%3, %139) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %142 = "mpi.vector_get"(%3, %140) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%135) ({
+// CHECK-NEXT:          %143 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %144 = memref.subview %143[2, 2, 2] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%144, %send_buff_ex5) : (memref<51x101xf32, strided<[11025, 105], offset: 22262>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex5_ptr, %25, %26, %138, %32, %141) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex5_ptr, %27, %28, %138, %32, %142) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%141) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%142) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "mpi.waitall"(%3, %2) : (!mpi.vector<!mpi.request>, i32) -> ()
+// CHECK-NEXT:        "scf.if"(%45) ({
+// CHECK-NEXT:          %145 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %146 = memref.subview %145[53, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 584537>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex0, %146) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 584537>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%64) ({
+// CHECK-NEXT:          %147 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %148 = memref.subview %147[1, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 11237>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex1, %148) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 11237>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%82) ({
+// CHECK-NEXT:          %149 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %150 = memref.subview %149[2, 103, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 32867>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex2, %150) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 32867>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%100) ({
+// CHECK-NEXT:          %151 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %152 = memref.subview %151[2, 1, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 22157>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex3, %152) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 22157>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%118) ({
+// CHECK-NEXT:          %153 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %154 = memref.subview %153[2, 2, 103] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22363>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex4, %154) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22363>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%135) ({
+// CHECK-NEXT:          %155 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %156 = memref.subview %155[2, 2, 1] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22261>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex5, %156) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22261>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %157 = memref.subview %u_t1[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %u_t0_blk = memref.subview %u_t0[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %u_t2_blk = memref.subview %u_t2[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %158 = arith.constant 0 : index
+// CHECK-NEXT:        %159 = arith.constant 51 : index
+// CHECK-NEXT:        %160 = arith.constant 101 : index
+// CHECK-NEXT:        "scf.parallel"(%158, %158, %158, %159, %160, %160, %time_m, %time_m, %time_m) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
+// CHECK-NEXT:        ^0(%161 : index, %162 : index, %163 : index):
 // CHECK-NEXT:          %dt = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %18 = arith.constant 2 : i64
-// CHECK-NEXT:          %19 = "math.fpowi"(%dt, %18) : (f32, i64) -> f32
-// CHECK-NEXT:          %20 = arith.constant -1 : i64
-// CHECK-NEXT:          %dt_1 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %21 = arith.constant -2 : i64
-// CHECK-NEXT:          %22 = "math.fpowi"(%dt_1, %21) : (f32, i64) -> f32
-// CHECK-NEXT:          %23 = memref.load %u_t2_loadview[%15, %16, %17] : memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %24 = arith.mulf %22, %23 : f32
-// CHECK-NEXT:          %25 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %dt_2 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:          %26 = arith.constant -2 : i64
-// CHECK-NEXT:          %27 = "math.fpowi"(%dt_2, %26) : (f32, i64) -> f32
-// CHECK-NEXT:          %28 = memref.load %u_t0_loadview[%15, %16, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %29 = arith.mulf %25, %27 : f32
-// CHECK-NEXT:          %30 = arith.mulf %29, %28 : f32
-// CHECK-NEXT:          %31 = arith.addf %24, %30 : f32
-// CHECK-NEXT:          %32 = arith.sitofp %20 : i64 to f32
-// CHECK-NEXT:          %33 = arith.mulf %32, %31 : f32
+// CHECK-NEXT:          %164 = arith.constant 2 : i64
+// CHECK-NEXT:          %165 = "math.fpowi"(%dt, %164) : (f32, i64) -> f32
+// CHECK-NEXT:          %166 = arith.constant -1 : i64
+// CHECK-NEXT:          %167 = arith.constant -2 : i64
+// CHECK-NEXT:          %168 = "math.fpowi"(%dt, %167) : (f32, i64) -> f32
+// CHECK-NEXT:          %169 = memref.load %u_t2_blk[%161, %162, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %170 = arith.mulf %168, %169 : f32
+// CHECK-NEXT:          %171 = arith.constant -2.000000e+00 : f32
+// CHECK-NEXT:          %172 = memref.load %u_t0_blk[%161, %162, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %173 = arith.mulf %171, %168 : f32
+// CHECK-NEXT:          %174 = arith.mulf %173, %172 : f32
+// CHECK-NEXT:          %175 = arith.addf %170, %174 : f32
+// CHECK-NEXT:          %176 = arith.sitofp %166 : i64 to f32
+// CHECK-NEXT:          %177 = arith.mulf %176, %175 : f32
 // CHECK-NEXT:          %h_x = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %34 = arith.constant -2 : i64
-// CHECK-NEXT:          %35 = "math.fpowi"(%h_x, %34) : (f32, i64) -> f32
-// CHECK-NEXT:          %36 = arith.constant -1 : index
-// CHECK-NEXT:          %37 = arith.addi %15, %36 : index
-// CHECK-NEXT:          %38 = memref.load %u_t0_loadview[%37, %16, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %39 = arith.mulf %35, %38 : f32
-// CHECK-NEXT:          %h_x_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %40 = arith.constant -2 : i64
-// CHECK-NEXT:          %41 = "math.fpowi"(%h_x_1, %40) : (f32, i64) -> f32
-// CHECK-NEXT:          %42 = arith.constant 1 : index
-// CHECK-NEXT:          %43 = arith.addi %15, %42 : index
-// CHECK-NEXT:          %44 = memref.load %u_t0_loadview[%43, %16, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %45 = arith.mulf %41, %44 : f32
-// CHECK-NEXT:          %46 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_x_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %47 = arith.constant -2 : i64
-// CHECK-NEXT:          %48 = "math.fpowi"(%h_x_2, %47) : (f32, i64) -> f32
-// CHECK-NEXT:          %49 = memref.load %u_t0_loadview[%15, %16, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %50 = arith.mulf %46, %48 : f32
-// CHECK-NEXT:          %51 = arith.mulf %50, %49 : f32
-// CHECK-NEXT:          %52 = arith.addf %39, %45 : f32
-// CHECK-NEXT:          %53 = arith.addf %52, %51 : f32
-// CHECK-NEXT:          %h_y = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %54 = arith.constant -2 : i64
-// CHECK-NEXT:          %55 = "math.fpowi"(%h_y, %54) : (f32, i64) -> f32
-// CHECK-NEXT:          %56 = arith.constant -1 : index
-// CHECK-NEXT:          %57 = arith.addi %16, %56 : index
-// CHECK-NEXT:          %58 = memref.load %u_t0_loadview[%15, %57, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %59 = arith.mulf %55, %58 : f32
-// CHECK-NEXT:          %h_y_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %60 = arith.constant -2 : i64
-// CHECK-NEXT:          %61 = "math.fpowi"(%h_y_1, %60) : (f32, i64) -> f32
-// CHECK-NEXT:          %62 = arith.constant 1 : index
-// CHECK-NEXT:          %63 = arith.addi %16, %62 : index
-// CHECK-NEXT:          %64 = memref.load %u_t0_loadview[%15, %63, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %65 = arith.mulf %61, %64 : f32
-// CHECK-NEXT:          %66 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_y_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %67 = arith.constant -2 : i64
-// CHECK-NEXT:          %68 = "math.fpowi"(%h_y_2, %67) : (f32, i64) -> f32
-// CHECK-NEXT:          %69 = memref.load %u_t0_loadview[%15, %16, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %70 = arith.mulf %66, %68 : f32
-// CHECK-NEXT:          %71 = arith.mulf %70, %69 : f32
-// CHECK-NEXT:          %72 = arith.addf %59, %65 : f32
-// CHECK-NEXT:          %73 = arith.addf %72, %71 : f32
-// CHECK-NEXT:          %h_z = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %74 = arith.constant -2 : i64
-// CHECK-NEXT:          %75 = "math.fpowi"(%h_z, %74) : (f32, i64) -> f32
-// CHECK-NEXT:          %76 = arith.constant -1 : index
-// CHECK-NEXT:          %77 = arith.addi %17, %76 : index
-// CHECK-NEXT:          %78 = memref.load %u_t0_loadview[%15, %16, %77] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %79 = arith.mulf %75, %78 : f32
-// CHECK-NEXT:          %h_z_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %80 = arith.constant -2 : i64
-// CHECK-NEXT:          %81 = "math.fpowi"(%h_z_1, %80) : (f32, i64) -> f32
-// CHECK-NEXT:          %82 = arith.constant 1 : index
-// CHECK-NEXT:          %83 = arith.addi %17, %82 : index
-// CHECK-NEXT:          %84 = memref.load %u_t0_loadview[%15, %16, %83] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %85 = arith.mulf %81, %84 : f32
-// CHECK-NEXT:          %86 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:          %h_z_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:          %87 = arith.constant -2 : i64
-// CHECK-NEXT:          %88 = "math.fpowi"(%h_z_2, %87) : (f32, i64) -> f32
-// CHECK-NEXT:          %89 = memref.load %u_t0_loadview[%15, %16, %17] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:          %90 = arith.mulf %86, %88 : f32
-// CHECK-NEXT:          %91 = arith.mulf %90, %89 : f32
-// CHECK-NEXT:          %92 = arith.addf %79, %85 : f32
-// CHECK-NEXT:          %93 = arith.addf %92, %91 : f32
-// CHECK-NEXT:          %94 = arith.addf %33, %53 : f32
-// CHECK-NEXT:          %95 = arith.addf %94, %73 : f32
-// CHECK-NEXT:          %96 = arith.addf %95, %93 : f32
-// CHECK-NEXT:          %97 = arith.mulf %19, %96 : f32
-// CHECK-NEXT:          memref.store %97, %u_t1_storeview[%15, %16, %17] : memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %178 = "math.fpowi"(%h_x, %167) : (f32, i64) -> f32
+// CHECK-NEXT:          %179 = arith.constant -1 : index
+// CHECK-NEXT:          %180 = arith.addi %161, %179 : index
+// CHECK-NEXT:          %181 = memref.load %u_t0_blk[%180, %162, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %182 = arith.mulf %178, %181 : f32
+// CHECK-NEXT:          %183 = arith.addi %161, %time_m : index
+// CHECK-NEXT:          %184 = memref.load %u_t0_blk[%183, %162, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %185 = arith.mulf %178, %184 : f32
+// CHECK-NEXT:          %186 = arith.mulf %171, %178 : f32
+// CHECK-NEXT:          %187 = arith.mulf %186, %172 : f32
+// CHECK-NEXT:          %188 = arith.addf %182, %185 : f32
+// CHECK-NEXT:          %189 = arith.addf %188, %187 : f32
+// CHECK-NEXT:          %190 = arith.addi %162, %179 : index
+// CHECK-NEXT:          %191 = memref.load %u_t0_blk[%161, %190, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %192 = arith.mulf %178, %191 : f32
+// CHECK-NEXT:          %193 = arith.addi %162, %time_m : index
+// CHECK-NEXT:          %194 = memref.load %u_t0_blk[%161, %193, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %195 = arith.mulf %178, %194 : f32
+// CHECK-NEXT:          %196 = arith.addf %192, %195 : f32
+// CHECK-NEXT:          %197 = arith.addf %196, %187 : f32
+// CHECK-NEXT:          %198 = arith.addi %163, %179 : index
+// CHECK-NEXT:          %199 = memref.load %u_t0_blk[%161, %162, %198] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %200 = arith.mulf %178, %199 : f32
+// CHECK-NEXT:          %201 = arith.addi %163, %time_m : index
+// CHECK-NEXT:          %202 = memref.load %u_t0_blk[%161, %162, %201] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:          %203 = arith.mulf %178, %202 : f32
+// CHECK-NEXT:          %204 = arith.addf %200, %203 : f32
+// CHECK-NEXT:          %205 = arith.addf %204, %187 : f32
+// CHECK-NEXT:          %206 = arith.addf %177, %189 : f32
+// CHECK-NEXT:          %207 = arith.addf %206, %197 : f32
+// CHECK-NEXT:          %208 = arith.addf %207, %205 : f32
+// CHECK-NEXT:          %209 = arith.mulf %165, %208 : f32
+// CHECK-NEXT:          memref.store %209, %157[%161, %162, %163] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (index, index, index, index, index, index, index, index, index) -> ()
-// CHECK-NEXT:        %u_t1_temp = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>
 // CHECK-NEXT:      }
-// CHECK-NEXT:      %98 = func.call @timer_end(%0) : (f64) -> f64
-// CHECK-NEXT:      "llvm.store"(%98, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
+// CHECK-NEXT:      %210 = func.call @timer_end(%0) : (f64) -> f64
+// CHECK-NEXT:      "llvm.store"(%210, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
 // CHECK-NEXT:      func.return
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64
 // CHECK-NEXT:    func.func private @timer_end(f64) -> f64
-// CHECK-NEXT:  }
\ No newline at end of file
+// CHECK-NEXT:  }
diff --git a/tests/filecheck/xdsl_mpi_pipeline_d.mlir b/tests/filecheck/xdsl_mpi_pipeline_d.mlir
index 24460a858d..d086da064e 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_d.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_d.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0}" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0},canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
@@ -111,140 +111,332 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 1 : index
 // CHECK-NEXT:      %time_M = arith.constant 20 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
-// CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3, %4, %5 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>) {
-// CHECK-NEXT:        %u_t1_storeview = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %u_t0_loadview = "memref.subview"(%u_t0) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 53, 103, 103>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        "dmp.swap"(%u_t0_loadview) {"topo" = #dmp.topo<2x1x1>, "swaps" = [#dmp.exchange<at [51, 0, 0] size [1, 101, 101] source offset [-1, 0, 0] to [1, 0, 0]>, #dmp.exchange<at [-1, 0, 0] size [1, 101, 101] source offset [1, 0, 0] to [-1, 0, 0]>, #dmp.exchange<at [0, 101, 0] size [51, 1, 101] source offset [0, -1, 0] to [0, 1, 0]>, #dmp.exchange<at [0, -1, 0] size [51, 1, 101] source offset [0, 1, 0] to [0, -1, 0]>, #dmp.exchange<at [0, 0, 101] size [51, 101, 1] source offset [0, 0, -1] to [0, 0, 1]>, #dmp.exchange<at [0, 0, -1] size [51, 101, 1] source offset [0, 0, 1] to [0, 0, -1]>]} : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> ()
-// CHECK-NEXT:        %u_t2_loadview = "memref.subview"(%u_t2) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %6 = arith.constant 0 : index
-// CHECK-NEXT:        %7 = arith.constant 0 : index
-// CHECK-NEXT:        %8 = arith.constant 0 : index
-// CHECK-NEXT:        %9 = arith.constant 1 : index
-// CHECK-NEXT:        %10 = arith.constant 1 : index
-// CHECK-NEXT:        %11 = arith.constant 1 : index
-// CHECK-NEXT:        %12 = arith.constant 51 : index
-// CHECK-NEXT:        %13 = arith.constant 101 : index
-// CHECK-NEXT:        %14 = arith.constant 101 : index
-// CHECK-NEXT:        %15 = arith.constant 0 : index
-// CHECK-NEXT:        %16 = arith.constant 64 : index
-// CHECK-NEXT:        %17 = arith.constant 64 : index
-// CHECK-NEXT:        %18 = arith.muli %9, %16 : index
-// CHECK-NEXT:        %19 = arith.muli %10, %17 : index
-// CHECK-NEXT:        "scf.parallel"(%6, %7, %12, %13, %18, %19) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
-// CHECK-NEXT:        ^0(%20 : index, %21 : index):
-// CHECK-NEXT:          %22 = "affine.min"(%16, %12, %20) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          %23 = "affine.min"(%17, %13, %21) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          "scf.parallel"(%15, %15, %8, %22, %23, %14, %9, %10, %11) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
-// CHECK-NEXT:          ^1(%24 : index, %25 : index, %26 : index):
-// CHECK-NEXT:            %27 = arith.addi %20, %24 : index
-// CHECK-NEXT:            %28 = arith.addi %21, %25 : index
+// CHECK-NEXT:      %1 = arith.addi %time_M, %time_m : index
+// CHECK-NEXT:      %2 = arith.constant 12 : i32
+// CHECK-NEXT:      %3 = "mpi.allocate"(%2) {"dtype" = !mpi.request} : (i32) -> !mpi.vector<!mpi.request>
+// CHECK-NEXT:      %4 = "mpi.comm.rank"() : () -> i32
+// CHECK-NEXT:      %send_buff_ex0 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %send_buff_ex0_ptr, %5, %6 = "mpi.unwrap_memref"(%send_buff_ex0) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex0 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %recv_buff_ex0_ptr, %7, %8 = "mpi.unwrap_memref"(%recv_buff_ex0) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex1 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %send_buff_ex1_ptr, %9, %10 = "mpi.unwrap_memref"(%send_buff_ex1) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex1 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
+// CHECK-NEXT:      %recv_buff_ex1_ptr, %11, %12 = "mpi.unwrap_memref"(%recv_buff_ex1) : (memref<101x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex2 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex2_ptr, %13, %14 = "mpi.unwrap_memref"(%send_buff_ex2) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex2 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex2_ptr, %15, %16 = "mpi.unwrap_memref"(%recv_buff_ex2) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex3 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex3_ptr, %17, %18 = "mpi.unwrap_memref"(%send_buff_ex3) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex3 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex3_ptr, %19, %20 = "mpi.unwrap_memref"(%recv_buff_ex3) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex4 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex4_ptr, %21, %22 = "mpi.unwrap_memref"(%send_buff_ex4) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex4 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex4_ptr, %23, %24 = "mpi.unwrap_memref"(%recv_buff_ex4) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %send_buff_ex5 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %send_buff_ex5_ptr, %25, %26 = "mpi.unwrap_memref"(%send_buff_ex5) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %recv_buff_ex5 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
+// CHECK-NEXT:      %recv_buff_ex5_ptr, %27, %28 = "mpi.unwrap_memref"(%recv_buff_ex5) : (memref<51x101xf32>) -> (!llvm.ptr, i32, !mpi.datatype)
+// CHECK-NEXT:      %29, %30, %31 = scf.for %time = %time_m to %1 step %time_m iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>) {
+// CHECK-NEXT:        %32 = arith.constant 0 : i32
+// CHECK-NEXT:        %33 = arith.constant 1 : i32
+// CHECK-NEXT:        %34 = arith.divui %4, %33 : i32
+// CHECK-NEXT:        %35 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %36 = arith.divui %35, %33 : i32
+// CHECK-NEXT:        %37 = arith.remui %35, %33 : i32
+// CHECK-NEXT:        %38 = arith.divui %37, %33 : i32
+// CHECK-NEXT:        %39 = arith.remui %37, %33 : i32
+// CHECK-NEXT:        %40 = arith.addi %34, %33 : i32
+// CHECK-NEXT:        %41 = arith.constant 2 : i32
+// CHECK-NEXT:        %42 = arith.cmpi slt, %40, %41 : i32
+// CHECK-NEXT:        %43 = arith.constant true
+// CHECK-NEXT:        %44 = arith.andi %42, %43 : i1
+// CHECK-NEXT:        %45 = arith.andi %44, %43 : i1
+// CHECK-NEXT:        %46 = arith.muli %33, %40 : i32
+// CHECK-NEXT:        %47 = arith.addi %38, %46 : i32
+// CHECK-NEXT:        %48 = arith.muli %33, %36 : i32
+// CHECK-NEXT:        %49 = arith.addi %47, %48 : i32
+// CHECK-NEXT:        %50 = arith.constant 6 : i32
+// CHECK-NEXT:        %51 = "mpi.vector_get"(%3, %32) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %52 = "mpi.vector_get"(%3, %50) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%45) ({
+// CHECK-NEXT:          %53 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %54 = memref.subview %53[52, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 573512>>
+// CHECK-NEXT:          "memref.copy"(%54, %send_buff_ex0) : (memref<101x101xf32, strided<[105, 1], offset: 573512>>, memref<101x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex0_ptr, %5, %6, %49, %32, %51) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex0_ptr, %7, %8, %49, %32, %52) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%51) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%52) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %55 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %56 = arith.divui %55, %33 : i32
+// CHECK-NEXT:        %57 = arith.remui %55, %33 : i32
+// CHECK-NEXT:        %58 = arith.divui %57, %33 : i32
+// CHECK-NEXT:        %59 = arith.remui %57, %33 : i32
+// CHECK-NEXT:        %60 = arith.constant -1 : i32
+// CHECK-NEXT:        %61 = arith.addi %34, %60 : i32
+// CHECK-NEXT:        %62 = arith.cmpi sge, %61, %32 : i32
+// CHECK-NEXT:        %63 = arith.andi %62, %43 : i1
+// CHECK-NEXT:        %64 = arith.andi %63, %43 : i1
+// CHECK-NEXT:        %65 = arith.muli %33, %61 : i32
+// CHECK-NEXT:        %66 = arith.addi %58, %65 : i32
+// CHECK-NEXT:        %67 = arith.muli %33, %56 : i32
+// CHECK-NEXT:        %68 = arith.addi %66, %67 : i32
+// CHECK-NEXT:        %69 = arith.constant 7 : i32
+// CHECK-NEXT:        %70 = "mpi.vector_get"(%3, %33) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %71 = "mpi.vector_get"(%3, %69) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%64) ({
+// CHECK-NEXT:          %72 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %73 = memref.subview %72[2, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%73, %send_buff_ex1) : (memref<101x101xf32, strided<[105, 1], offset: 22262>>, memref<101x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex1_ptr, %9, %10, %68, %32, %70) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex1_ptr, %11, %12, %68, %32, %71) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%70) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%71) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %74 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %75 = arith.divui %74, %33 : i32
+// CHECK-NEXT:        %76 = arith.remui %74, %33 : i32
+// CHECK-NEXT:        %77 = arith.divui %76, %33 : i32
+// CHECK-NEXT:        %78 = arith.remui %76, %33 : i32
+// CHECK-NEXT:        %79 = arith.addi %75, %33 : i32
+// CHECK-NEXT:        %80 = arith.cmpi slt, %79, %33 : i32
+// CHECK-NEXT:        %81 = arith.andi %43, %80 : i1
+// CHECK-NEXT:        %82 = arith.andi %81, %43 : i1
+// CHECK-NEXT:        %83 = arith.muli %33, %34 : i32
+// CHECK-NEXT:        %84 = arith.addi %77, %83 : i32
+// CHECK-NEXT:        %85 = arith.muli %33, %79 : i32
+// CHECK-NEXT:        %86 = arith.addi %84, %85 : i32
+// CHECK-NEXT:        %87 = arith.constant 8 : i32
+// CHECK-NEXT:        %88 = "mpi.vector_get"(%3, %41) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %89 = "mpi.vector_get"(%3, %87) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%82) ({
+// CHECK-NEXT:          %90 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %91 = memref.subview %90[2, 102, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 32762>>
+// CHECK-NEXT:          "memref.copy"(%91, %send_buff_ex2) : (memref<51x101xf32, strided<[11025, 1], offset: 32762>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex2_ptr, %13, %14, %86, %32, %88) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex2_ptr, %15, %16, %86, %32, %89) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%88) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%89) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %92 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %93 = arith.divui %92, %33 : i32
+// CHECK-NEXT:        %94 = arith.remui %92, %33 : i32
+// CHECK-NEXT:        %95 = arith.divui %94, %33 : i32
+// CHECK-NEXT:        %96 = arith.remui %94, %33 : i32
+// CHECK-NEXT:        %97 = arith.addi %93, %60 : i32
+// CHECK-NEXT:        %98 = arith.cmpi sge, %97, %32 : i32
+// CHECK-NEXT:        %99 = arith.andi %43, %98 : i1
+// CHECK-NEXT:        %100 = arith.andi %99, %43 : i1
+// CHECK-NEXT:        %101 = arith.addi %95, %83 : i32
+// CHECK-NEXT:        %102 = arith.muli %33, %97 : i32
+// CHECK-NEXT:        %103 = arith.addi %101, %102 : i32
+// CHECK-NEXT:        %104 = arith.constant 3 : i32
+// CHECK-NEXT:        %105 = arith.constant 9 : i32
+// CHECK-NEXT:        %106 = "mpi.vector_get"(%3, %104) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %107 = "mpi.vector_get"(%3, %105) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%100) ({
+// CHECK-NEXT:          %108 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %109 = memref.subview %108[2, 2, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%109, %send_buff_ex3) : (memref<51x101xf32, strided<[11025, 1], offset: 22262>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex3_ptr, %17, %18, %103, %32, %106) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex3_ptr, %19, %20, %103, %32, %107) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%106) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%107) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %110 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %111 = arith.divui %110, %33 : i32
+// CHECK-NEXT:        %112 = arith.remui %110, %33 : i32
+// CHECK-NEXT:        %113 = arith.divui %112, %33 : i32
+// CHECK-NEXT:        %114 = arith.remui %112, %33 : i32
+// CHECK-NEXT:        %115 = arith.addi %113, %33 : i32
+// CHECK-NEXT:        %116 = arith.cmpi slt, %115, %33 : i32
+// CHECK-NEXT:        %117 = arith.andi %43, %43 : i1
+// CHECK-NEXT:        %118 = arith.andi %117, %116 : i1
+// CHECK-NEXT:        %119 = arith.addi %115, %83 : i32
+// CHECK-NEXT:        %120 = arith.muli %33, %111 : i32
+// CHECK-NEXT:        %121 = arith.addi %119, %120 : i32
+// CHECK-NEXT:        %122 = arith.constant 4 : i32
+// CHECK-NEXT:        %123 = arith.constant 10 : i32
+// CHECK-NEXT:        %124 = "mpi.vector_get"(%3, %122) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %125 = "mpi.vector_get"(%3, %123) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%118) ({
+// CHECK-NEXT:          %126 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %127 = memref.subview %126[2, 2, 102] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22362>>
+// CHECK-NEXT:          "memref.copy"(%127, %send_buff_ex4) : (memref<51x101xf32, strided<[11025, 105], offset: 22362>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex4_ptr, %21, %22, %121, %32, %124) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex4_ptr, %23, %24, %121, %32, %125) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%124) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%125) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %128 = arith.remui %4, %33 : i32
+// CHECK-NEXT:        %129 = arith.divui %128, %33 : i32
+// CHECK-NEXT:        %130 = arith.remui %128, %33 : i32
+// CHECK-NEXT:        %131 = arith.divui %130, %33 : i32
+// CHECK-NEXT:        %132 = arith.remui %130, %33 : i32
+// CHECK-NEXT:        %133 = arith.addi %131, %60 : i32
+// CHECK-NEXT:        %134 = arith.cmpi sge, %133, %32 : i32
+// CHECK-NEXT:        %135 = arith.andi %117, %134 : i1
+// CHECK-NEXT:        %136 = arith.addi %133, %83 : i32
+// CHECK-NEXT:        %137 = arith.muli %33, %129 : i32
+// CHECK-NEXT:        %138 = arith.addi %136, %137 : i32
+// CHECK-NEXT:        %139 = arith.constant 5 : i32
+// CHECK-NEXT:        %140 = arith.constant 11 : i32
+// CHECK-NEXT:        %141 = "mpi.vector_get"(%3, %139) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        %142 = "mpi.vector_get"(%3, %140) : (!mpi.vector<!mpi.request>, i32) -> !mpi.request
+// CHECK-NEXT:        "scf.if"(%135) ({
+// CHECK-NEXT:          %143 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %144 = memref.subview %143[2, 2, 2] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%144, %send_buff_ex5) : (memref<51x101xf32, strided<[11025, 105], offset: 22262>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          "mpi.isend"(%send_buff_ex5_ptr, %25, %26, %138, %32, %141) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          "mpi.irecv"(%recv_buff_ex5_ptr, %27, %28, %138, %32, %142) : (!llvm.ptr, i32, !mpi.datatype, i32, i32, !mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          "mpi.request_null"(%141) : (!mpi.request) -> ()
+// CHECK-NEXT:          "mpi.request_null"(%142) : (!mpi.request) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "mpi.waitall"(%3, %2) : (!mpi.vector<!mpi.request>, i32) -> ()
+// CHECK-NEXT:        "scf.if"(%45) ({
+// CHECK-NEXT:          %145 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %146 = memref.subview %145[53, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 584537>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex0, %146) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 584537>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%64) ({
+// CHECK-NEXT:          %147 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %148 = memref.subview %147[1, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 11237>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex1, %148) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 11237>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%82) ({
+// CHECK-NEXT:          %149 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %150 = memref.subview %149[2, 103, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 32867>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex2, %150) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 32867>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%100) ({
+// CHECK-NEXT:          %151 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %152 = memref.subview %151[2, 1, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 22157>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex3, %152) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 22157>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%118) ({
+// CHECK-NEXT:          %153 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %154 = memref.subview %153[2, 2, 103] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22363>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex4, %154) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22363>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        "scf.if"(%135) ({
+// CHECK-NEXT:          %155 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %156 = memref.subview %155[2, 2, 1] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22261>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex5, %156) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22261>>) -> ()
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          scf.yield
+// CHECK-NEXT:        }) : (i1) -> ()
+// CHECK-NEXT:        %157 = memref.subview %u_t1[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %u_t0_blk = memref.subview %u_t0[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %u_t2_blk = memref.subview %u_t2[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %158 = arith.constant 0 : index
+// CHECK-NEXT:        %159 = arith.constant 51 : index
+// CHECK-NEXT:        %160 = arith.constant 101 : index
+// CHECK-NEXT:        %161 = arith.constant 64 : index
+// CHECK-NEXT:        %162 = arith.muli %time_m, %161 : index
+// CHECK-NEXT:        "scf.parallel"(%158, %158, %159, %160, %162, %162) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
+// CHECK-NEXT:        ^0(%163 : index, %164 : index):
+// CHECK-NEXT:          %165 = "affine.min"(%161, %159, %163) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          %166 = "affine.min"(%161, %160, %164) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          "scf.parallel"(%158, %158, %158, %165, %166, %160, %time_m, %time_m, %time_m) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
+// CHECK-NEXT:          ^1(%167 : index, %168 : index, %169 : index):
+// CHECK-NEXT:            %170 = arith.addi %163, %167 : index
+// CHECK-NEXT:            %171 = arith.addi %164, %168 : index
 // CHECK-NEXT:            %dt = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:            %29 = arith.constant 2 : i64
-// CHECK-NEXT:            %30 = "math.fpowi"(%dt, %29) : (f32, i64) -> f32
-// CHECK-NEXT:            %31 = arith.constant -1 : i64
-// CHECK-NEXT:            %dt_1 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:            %32 = arith.constant -2 : i64
-// CHECK-NEXT:            %33 = "math.fpowi"(%dt_1, %32) : (f32, i64) -> f32
-// CHECK-NEXT:            %34 = memref.load %u_t2_loadview[%27, %28, %26] : memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %35 = arith.mulf %33, %34 : f32
-// CHECK-NEXT:            %36 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %dt_2 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:            %37 = arith.constant -2 : i64
-// CHECK-NEXT:            %38 = "math.fpowi"(%dt_2, %37) : (f32, i64) -> f32
-// CHECK-NEXT:            %39 = memref.load %u_t0_loadview[%27, %28, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %40 = arith.mulf %36, %38 : f32
-// CHECK-NEXT:            %41 = arith.mulf %40, %39 : f32
-// CHECK-NEXT:            %42 = arith.addf %35, %41 : f32
-// CHECK-NEXT:            %43 = arith.sitofp %31 : i64 to f32
-// CHECK-NEXT:            %44 = arith.mulf %43, %42 : f32
+// CHECK-NEXT:            %172 = arith.constant 2 : i64
+// CHECK-NEXT:            %173 = "math.fpowi"(%dt, %172) : (f32, i64) -> f32
+// CHECK-NEXT:            %174 = arith.constant -1 : i64
+// CHECK-NEXT:            %175 = arith.constant -2 : i64
+// CHECK-NEXT:            %176 = "math.fpowi"(%dt, %175) : (f32, i64) -> f32
+// CHECK-NEXT:            %177 = memref.load %u_t2_blk[%170, %171, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %178 = arith.mulf %176, %177 : f32
+// CHECK-NEXT:            %179 = arith.constant -2.000000e+00 : f32
+// CHECK-NEXT:            %180 = memref.load %u_t0_blk[%170, %171, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %181 = arith.mulf %179, %176 : f32
+// CHECK-NEXT:            %182 = arith.mulf %181, %180 : f32
+// CHECK-NEXT:            %183 = arith.addf %178, %182 : f32
+// CHECK-NEXT:            %184 = arith.sitofp %174 : i64 to f32
+// CHECK-NEXT:            %185 = arith.mulf %184, %183 : f32
 // CHECK-NEXT:            %h_x = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %45 = arith.constant -2 : i64
-// CHECK-NEXT:            %46 = "math.fpowi"(%h_x, %45) : (f32, i64) -> f32
-// CHECK-NEXT:            %47 = arith.constant -1 : index
-// CHECK-NEXT:            %48 = arith.addi %27, %47 : index
-// CHECK-NEXT:            %49 = memref.load %u_t0_loadview[%48, %28, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %50 = arith.mulf %46, %49 : f32
-// CHECK-NEXT:            %h_x_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %51 = arith.constant -2 : i64
-// CHECK-NEXT:            %52 = "math.fpowi"(%h_x_1, %51) : (f32, i64) -> f32
-// CHECK-NEXT:            %53 = arith.constant 1 : index
-// CHECK-NEXT:            %54 = arith.addi %27, %53 : index
-// CHECK-NEXT:            %55 = memref.load %u_t0_loadview[%54, %28, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %56 = arith.mulf %52, %55 : f32
-// CHECK-NEXT:            %57 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_x_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %58 = arith.constant -2 : i64
-// CHECK-NEXT:            %59 = "math.fpowi"(%h_x_2, %58) : (f32, i64) -> f32
-// CHECK-NEXT:            %60 = memref.load %u_t0_loadview[%27, %28, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %61 = arith.mulf %57, %59 : f32
-// CHECK-NEXT:            %62 = arith.mulf %61, %60 : f32
-// CHECK-NEXT:            %63 = arith.addf %50, %56 : f32
-// CHECK-NEXT:            %64 = arith.addf %63, %62 : f32
-// CHECK-NEXT:            %h_y = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %65 = arith.constant -2 : i64
-// CHECK-NEXT:            %66 = "math.fpowi"(%h_y, %65) : (f32, i64) -> f32
-// CHECK-NEXT:            %67 = arith.constant -1 : index
-// CHECK-NEXT:            %68 = arith.addi %28, %67 : index
-// CHECK-NEXT:            %69 = memref.load %u_t0_loadview[%27, %68, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %70 = arith.mulf %66, %69 : f32
-// CHECK-NEXT:            %h_y_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %71 = arith.constant -2 : i64
-// CHECK-NEXT:            %72 = "math.fpowi"(%h_y_1, %71) : (f32, i64) -> f32
-// CHECK-NEXT:            %73 = arith.constant 1 : index
-// CHECK-NEXT:            %74 = arith.addi %28, %73 : index
-// CHECK-NEXT:            %75 = memref.load %u_t0_loadview[%27, %74, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %76 = arith.mulf %72, %75 : f32
-// CHECK-NEXT:            %77 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_y_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %78 = arith.constant -2 : i64
-// CHECK-NEXT:            %79 = "math.fpowi"(%h_y_2, %78) : (f32, i64) -> f32
-// CHECK-NEXT:            %80 = memref.load %u_t0_loadview[%27, %28, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %81 = arith.mulf %77, %79 : f32
-// CHECK-NEXT:            %82 = arith.mulf %81, %80 : f32
-// CHECK-NEXT:            %83 = arith.addf %70, %76 : f32
-// CHECK-NEXT:            %84 = arith.addf %83, %82 : f32
-// CHECK-NEXT:            %h_z = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %85 = arith.constant -2 : i64
-// CHECK-NEXT:            %86 = "math.fpowi"(%h_z, %85) : (f32, i64) -> f32
-// CHECK-NEXT:            %87 = arith.constant -1 : index
-// CHECK-NEXT:            %88 = arith.addi %26, %87 : index
-// CHECK-NEXT:            %89 = memref.load %u_t0_loadview[%27, %28, %88] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %90 = arith.mulf %86, %89 : f32
-// CHECK-NEXT:            %h_z_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %91 = arith.constant -2 : i64
-// CHECK-NEXT:            %92 = "math.fpowi"(%h_z_1, %91) : (f32, i64) -> f32
-// CHECK-NEXT:            %93 = arith.constant 1 : index
-// CHECK-NEXT:            %94 = arith.addi %26, %93 : index
-// CHECK-NEXT:            %95 = memref.load %u_t0_loadview[%27, %28, %94] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %96 = arith.mulf %92, %95 : f32
-// CHECK-NEXT:            %97 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_z_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %98 = arith.constant -2 : i64
-// CHECK-NEXT:            %99 = "math.fpowi"(%h_z_2, %98) : (f32, i64) -> f32
-// CHECK-NEXT:            %100 = memref.load %u_t0_loadview[%27, %28, %26] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %101 = arith.mulf %97, %99 : f32
-// CHECK-NEXT:            %102 = arith.mulf %101, %100 : f32
-// CHECK-NEXT:            %103 = arith.addf %90, %96 : f32
-// CHECK-NEXT:            %104 = arith.addf %103, %102 : f32
-// CHECK-NEXT:            %105 = arith.addf %44, %64 : f32
-// CHECK-NEXT:            %106 = arith.addf %105, %84 : f32
-// CHECK-NEXT:            %107 = arith.addf %106, %104 : f32
-// CHECK-NEXT:            %108 = arith.mulf %30, %107 : f32
-// CHECK-NEXT:            memref.store %108, %u_t1_storeview[%27, %28, %26] : memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %186 = "math.fpowi"(%h_x, %175) : (f32, i64) -> f32
+// CHECK-NEXT:            %187 = arith.constant -1 : index
+// CHECK-NEXT:            %188 = arith.addi %170, %187 : index
+// CHECK-NEXT:            %189 = memref.load %u_t0_blk[%188, %171, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %190 = arith.mulf %186, %189 : f32
+// CHECK-NEXT:            %191 = arith.addi %170, %time_m : index
+// CHECK-NEXT:            %192 = memref.load %u_t0_blk[%191, %171, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %193 = arith.mulf %186, %192 : f32
+// CHECK-NEXT:            %194 = arith.mulf %179, %186 : f32
+// CHECK-NEXT:            %195 = arith.mulf %194, %180 : f32
+// CHECK-NEXT:            %196 = arith.addf %190, %193 : f32
+// CHECK-NEXT:            %197 = arith.addf %196, %195 : f32
+// CHECK-NEXT:            %198 = arith.addi %171, %187 : index
+// CHECK-NEXT:            %199 = memref.load %u_t0_blk[%170, %198, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %200 = arith.mulf %186, %199 : f32
+// CHECK-NEXT:            %201 = arith.addi %171, %time_m : index
+// CHECK-NEXT:            %202 = memref.load %u_t0_blk[%170, %201, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %203 = arith.mulf %186, %202 : f32
+// CHECK-NEXT:            %204 = arith.addf %200, %203 : f32
+// CHECK-NEXT:            %205 = arith.addf %204, %195 : f32
+// CHECK-NEXT:            %206 = arith.addi %169, %187 : index
+// CHECK-NEXT:            %207 = memref.load %u_t0_blk[%170, %171, %206] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %208 = arith.mulf %186, %207 : f32
+// CHECK-NEXT:            %209 = arith.addi %169, %time_m : index
+// CHECK-NEXT:            %210 = memref.load %u_t0_blk[%170, %171, %209] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %211 = arith.mulf %186, %210 : f32
+// CHECK-NEXT:            %212 = arith.addf %208, %211 : f32
+// CHECK-NEXT:            %213 = arith.addf %212, %195 : f32
+// CHECK-NEXT:            %214 = arith.addf %185, %197 : f32
+// CHECK-NEXT:            %215 = arith.addf %214, %205 : f32
+// CHECK-NEXT:            %216 = arith.addf %215, %213 : f32
+// CHECK-NEXT:            %217 = arith.mulf %173, %216 : f32
+// CHECK-NEXT:            memref.store %217, %157[%170, %171, %169] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
 // CHECK-NEXT:            scf.yield
 // CHECK-NEXT:          }) : (index, index, index, index, index, index, index, index, index) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (index, index, index, index, index, index) -> ()
-// CHECK-NEXT:        %u_t1_temp = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>
 // CHECK-NEXT:      }
-// CHECK-NEXT:      %109 = func.call @timer_end(%0) : (f64) -> f64
-// CHECK-NEXT:      "llvm.store"(%109, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
+// CHECK-NEXT:      %218 = func.call @timer_end(%0) : (f64) -> f64
+// CHECK-NEXT:      "llvm.store"(%218, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
 // CHECK-NEXT:      func.return
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64
 // CHECK-NEXT:    func.func private @timer_end(f64) -> f64
-// CHECK-NEXT:  }
\ No newline at end of file
+// CHECK-NEXT:  }
diff --git a/tests/filecheck/xdsl_mpi_pipeline_e.mlir b/tests/filecheck/xdsl_mpi_pipeline_e.mlir
index 14e82658d0..d39ba2b16a 100644
--- a/tests/filecheck/xdsl_mpi_pipeline_e.mlir
+++ b/tests/filecheck/xdsl_mpi_pipeline_e.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0},dmp-to-mpi{mpi_init=false},lower-mpi" %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,distribute-stencil{strategy=3d-grid slices=2,1,1 restrict_domain=false},shape-inference,canonicalize-dmp,stencil-bufferize,dmp-to-mpi{mpi_init=false},convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,64,0},dmp-to-mpi{mpi_init=false},lower-mpi,canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%u_vec0 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec1 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %u_vec2 : !stencil.field<[-2,53]x[-2,103]x[-2,103]xf32>, %timers : !llvm.ptr) {
@@ -110,586 +110,429 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 1 : index
 // CHECK-NEXT:      %time_M = arith.constant 20 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
-// CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3 = arith.constant 12 : i32
-// CHECK-NEXT:      %4 = "llvm.alloca"(%3) <{"alignment" = 32 : i64, "elem_type" = i32}> : (i32) -> !llvm.ptr
-// CHECK-NEXT:      %5 = arith.constant 1140850688 : i32
-// CHECK-NEXT:      %6 = arith.constant 1 : i64
-// CHECK-NEXT:      %7 = "llvm.alloca"(%6) <{"alignment" = 32 : i64, "elem_type" = i32}> : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %8 = func.call @MPI_Comm_rank(%5, %7) : (i32, !llvm.ptr) -> i32
-// CHECK-NEXT:      %9 = "llvm.load"(%7) : (!llvm.ptr) -> i32
+// CHECK-NEXT:      %1 = arith.addi %time_M, %time_m : index
+// CHECK-NEXT:      %2 = arith.constant 12 : i32
+// CHECK-NEXT:      %3 = "llvm.alloca"(%2) <{"alignment" = 32 : i64, "elem_type" = i32}> : (i32) -> !llvm.ptr
+// CHECK-NEXT:      %4 = arith.constant 1140850688 : i32
+// CHECK-NEXT:      %5 = arith.constant 1 : i64
+// CHECK-NEXT:      %6 = "llvm.alloca"(%5) <{"alignment" = 32 : i64, "elem_type" = i32}> : (i64) -> !llvm.ptr
+// CHECK-NEXT:      %7 = func.call @MPI_Comm_rank(%4, %6) : (i32, !llvm.ptr) -> i32
+// CHECK-NEXT:      %8 = "llvm.load"(%6) : (!llvm.ptr) -> i32
 // CHECK-NEXT:      %send_buff_ex0 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
-// CHECK-NEXT:      %10 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex0) : (memref<101x101xf32>) -> index
-// CHECK-NEXT:      %11 = arith.index_cast %10 : index to i64
-// CHECK-NEXT:      %send_buff_ex0_ptr = "llvm.inttoptr"(%11) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %12 = arith.constant 10201 : i32
-// CHECK-NEXT:      %13 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %9 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex0) : (memref<101x101xf32>) -> index
+// CHECK-NEXT:      %10 = arith.index_cast %9 : index to i64
+// CHECK-NEXT:      %send_buff_ex0_ptr = "llvm.inttoptr"(%10) : (i64) -> !llvm.ptr
+// CHECK-NEXT:      %11 = arith.constant 10201 : i32
+// CHECK-NEXT:      %12 = arith.constant 1275069450 : i32
 // CHECK-NEXT:      %recv_buff_ex0 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
-// CHECK-NEXT:      %14 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex0) : (memref<101x101xf32>) -> index
-// CHECK-NEXT:      %15 = arith.index_cast %14 : index to i64
-// CHECK-NEXT:      %recv_buff_ex0_ptr = "llvm.inttoptr"(%15) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %16 = arith.constant 10201 : i32
-// CHECK-NEXT:      %17 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %13 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex0) : (memref<101x101xf32>) -> index
+// CHECK-NEXT:      %14 = arith.index_cast %13 : index to i64
+// CHECK-NEXT:      %recv_buff_ex0_ptr = "llvm.inttoptr"(%14) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %send_buff_ex1 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
-// CHECK-NEXT:      %18 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex1) : (memref<101x101xf32>) -> index
-// CHECK-NEXT:      %19 = arith.index_cast %18 : index to i64
-// CHECK-NEXT:      %send_buff_ex1_ptr = "llvm.inttoptr"(%19) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %20 = arith.constant 10201 : i32
-// CHECK-NEXT:      %21 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %15 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex1) : (memref<101x101xf32>) -> index
+// CHECK-NEXT:      %16 = arith.index_cast %15 : index to i64
+// CHECK-NEXT:      %send_buff_ex1_ptr = "llvm.inttoptr"(%16) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %recv_buff_ex1 = memref.alloc() {"alignment" = 64 : i64} : memref<101x101xf32>
-// CHECK-NEXT:      %22 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex1) : (memref<101x101xf32>) -> index
-// CHECK-NEXT:      %23 = arith.index_cast %22 : index to i64
-// CHECK-NEXT:      %recv_buff_ex1_ptr = "llvm.inttoptr"(%23) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %24 = arith.constant 10201 : i32
-// CHECK-NEXT:      %25 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %17 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex1) : (memref<101x101xf32>) -> index
+// CHECK-NEXT:      %18 = arith.index_cast %17 : index to i64
+// CHECK-NEXT:      %recv_buff_ex1_ptr = "llvm.inttoptr"(%18) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %send_buff_ex2 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %26 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex2) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %27 = arith.index_cast %26 : index to i64
-// CHECK-NEXT:      %send_buff_ex2_ptr = "llvm.inttoptr"(%27) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %28 = arith.constant 5151 : i32
-// CHECK-NEXT:      %29 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %19 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex2) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %20 = arith.index_cast %19 : index to i64
+// CHECK-NEXT:      %send_buff_ex2_ptr = "llvm.inttoptr"(%20) : (i64) -> !llvm.ptr
+// CHECK-NEXT:      %21 = arith.constant 5151 : i32
 // CHECK-NEXT:      %recv_buff_ex2 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %30 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex2) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %31 = arith.index_cast %30 : index to i64
-// CHECK-NEXT:      %recv_buff_ex2_ptr = "llvm.inttoptr"(%31) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %32 = arith.constant 5151 : i32
-// CHECK-NEXT:      %33 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %22 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex2) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %23 = arith.index_cast %22 : index to i64
+// CHECK-NEXT:      %recv_buff_ex2_ptr = "llvm.inttoptr"(%23) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %send_buff_ex3 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %34 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex3) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %35 = arith.index_cast %34 : index to i64
-// CHECK-NEXT:      %send_buff_ex3_ptr = "llvm.inttoptr"(%35) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %36 = arith.constant 5151 : i32
-// CHECK-NEXT:      %37 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %24 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex3) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %25 = arith.index_cast %24 : index to i64
+// CHECK-NEXT:      %send_buff_ex3_ptr = "llvm.inttoptr"(%25) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %recv_buff_ex3 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %38 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex3) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %39 = arith.index_cast %38 : index to i64
-// CHECK-NEXT:      %recv_buff_ex3_ptr = "llvm.inttoptr"(%39) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %40 = arith.constant 5151 : i32
-// CHECK-NEXT:      %41 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %26 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex3) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %27 = arith.index_cast %26 : index to i64
+// CHECK-NEXT:      %recv_buff_ex3_ptr = "llvm.inttoptr"(%27) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %send_buff_ex4 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %42 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex4) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %43 = arith.index_cast %42 : index to i64
-// CHECK-NEXT:      %send_buff_ex4_ptr = "llvm.inttoptr"(%43) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %44 = arith.constant 5151 : i32
-// CHECK-NEXT:      %45 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %28 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex4) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %29 = arith.index_cast %28 : index to i64
+// CHECK-NEXT:      %send_buff_ex4_ptr = "llvm.inttoptr"(%29) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %recv_buff_ex4 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %46 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex4) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %47 = arith.index_cast %46 : index to i64
-// CHECK-NEXT:      %recv_buff_ex4_ptr = "llvm.inttoptr"(%47) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %48 = arith.constant 5151 : i32
-// CHECK-NEXT:      %49 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %30 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex4) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %31 = arith.index_cast %30 : index to i64
+// CHECK-NEXT:      %recv_buff_ex4_ptr = "llvm.inttoptr"(%31) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %send_buff_ex5 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %50 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex5) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %51 = arith.index_cast %50 : index to i64
-// CHECK-NEXT:      %send_buff_ex5_ptr = "llvm.inttoptr"(%51) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %52 = arith.constant 5151 : i32
-// CHECK-NEXT:      %53 = arith.constant 1275069450 : i32
+// CHECK-NEXT:      %32 = "memref.extract_aligned_pointer_as_index"(%send_buff_ex5) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %33 = arith.index_cast %32 : index to i64
+// CHECK-NEXT:      %send_buff_ex5_ptr = "llvm.inttoptr"(%33) : (i64) -> !llvm.ptr
 // CHECK-NEXT:      %recv_buff_ex5 = memref.alloc() {"alignment" = 64 : i64} : memref<51x101xf32>
-// CHECK-NEXT:      %54 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex5) : (memref<51x101xf32>) -> index
-// CHECK-NEXT:      %55 = arith.index_cast %54 : index to i64
-// CHECK-NEXT:      %recv_buff_ex5_ptr = "llvm.inttoptr"(%55) : (i64) -> !llvm.ptr
-// CHECK-NEXT:      %56 = arith.constant 5151 : i32
-// CHECK-NEXT:      %57 = arith.constant 1275069450 : i32
-// CHECK-NEXT:      %58, %59, %60 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>) {
-// CHECK-NEXT:        %u_t1_storeview = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %u_t0_loadview = "memref.subview"(%u_t0) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 53, 103, 103>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %61 = arith.constant 0 : i32
-// CHECK-NEXT:        %62 = arith.constant 1 : i32
-// CHECK-NEXT:        %63 = arith.divui %9, %62 : i32
-// CHECK-NEXT:        %64 = arith.remui %9, %62 : i32
-// CHECK-NEXT:        %65 = arith.constant 1 : i32
-// CHECK-NEXT:        %66 = arith.divui %64, %65 : i32
-// CHECK-NEXT:        %67 = arith.remui %64, %65 : i32
-// CHECK-NEXT:        %68 = arith.constant 1 : i32
-// CHECK-NEXT:        %69 = arith.divui %67, %68 : i32
-// CHECK-NEXT:        %70 = arith.remui %67, %68 : i32
-// CHECK-NEXT:        %71 = arith.constant 1 : i32
-// CHECK-NEXT:        %72 = arith.addi %63, %71 : i32
-// CHECK-NEXT:        %73 = arith.constant 2 : i32
-// CHECK-NEXT:        %74 = arith.cmpi slt, %72, %73 : i32
-// CHECK-NEXT:        %75 = arith.constant true
-// CHECK-NEXT:        %76 = arith.constant true
-// CHECK-NEXT:        %77 = arith.andi %74, %75 : i1
-// CHECK-NEXT:        %78 = arith.andi %77, %76 : i1
-// CHECK-NEXT:        %79 = arith.constant 1 : i32
-// CHECK-NEXT:        %80 = arith.muli %79, %72 : i32
-// CHECK-NEXT:        %81 = arith.addi %69, %80 : i32
-// CHECK-NEXT:        %82 = arith.constant 1 : i32
-// CHECK-NEXT:        %83 = arith.muli %82, %66 : i32
-// CHECK-NEXT:        %84 = arith.addi %81, %83 : i32
-// CHECK-NEXT:        %85 = arith.constant 0 : i32
-// CHECK-NEXT:        %86 = arith.constant 6 : i32
-// CHECK-NEXT:        %87 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %88 = arith.constant 4 : i64
-// CHECK-NEXT:        %89 = arith.index_cast %85 : i32 to index
-// CHECK-NEXT:        %90 = arith.index_cast %89 : index to i64
-// CHECK-NEXT:        %91 = arith.muli %88, %90 : i64
-// CHECK-NEXT:        %92 = arith.addi %91, %87 : i64
-// CHECK-NEXT:        %93 = "llvm.inttoptr"(%92) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %94 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %95 = arith.constant 4 : i64
-// CHECK-NEXT:        %96 = arith.index_cast %86 : i32 to index
-// CHECK-NEXT:        %97 = arith.index_cast %96 : index to i64
-// CHECK-NEXT:        %98 = arith.muli %95, %97 : i64
-// CHECK-NEXT:        %99 = arith.addi %98, %94 : i64
-// CHECK-NEXT:        %100 = "llvm.inttoptr"(%99) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        "scf.if"(%78) ({
-// CHECK-NEXT:          %101 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 50, 0, 0>, "static_sizes" = array<i64: 1, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<101x101xf32, strided<[105, 1], offset: 573512>>
-// CHECK-NEXT:          "memref.copy"(%101, %send_buff_ex0) : (memref<101x101xf32, strided<[105, 1], offset: 573512>>, memref<101x101xf32>) -> ()
-// CHECK-NEXT:          %102 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %103 = func.call @MPI_Isend(%send_buff_ex0_ptr, %12, %13, %84, %61, %102, %93) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
-// CHECK-NEXT:          %104 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %105 = func.call @MPI_Irecv(%recv_buff_ex0_ptr, %16, %17, %84, %61, %104, %100) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:      %34 = "memref.extract_aligned_pointer_as_index"(%recv_buff_ex5) : (memref<51x101xf32>) -> index
+// CHECK-NEXT:      %35 = arith.index_cast %34 : index to i64
+// CHECK-NEXT:      %recv_buff_ex5_ptr = "llvm.inttoptr"(%35) : (i64) -> !llvm.ptr
+// CHECK-NEXT:      %36, %37, %38 = scf.for %time = %time_m to %1 step %time_m iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1, %u_t2 = %u_vec2) -> (memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>) {
+// CHECK-NEXT:        %39 = arith.constant 0 : i32
+// CHECK-NEXT:        %40 = arith.constant 1 : i32
+// CHECK-NEXT:        %41 = arith.divui %8, %40 : i32
+// CHECK-NEXT:        %42 = arith.remui %8, %40 : i32
+// CHECK-NEXT:        %43 = arith.divui %42, %40 : i32
+// CHECK-NEXT:        %44 = arith.remui %42, %40 : i32
+// CHECK-NEXT:        %45 = arith.divui %44, %40 : i32
+// CHECK-NEXT:        %46 = arith.remui %44, %40 : i32
+// CHECK-NEXT:        %47 = arith.addi %41, %40 : i32
+// CHECK-NEXT:        %48 = arith.constant 2 : i32
+// CHECK-NEXT:        %49 = arith.cmpi slt, %47, %48 : i32
+// CHECK-NEXT:        %50 = arith.constant true
+// CHECK-NEXT:        %51 = arith.andi %49, %50 : i1
+// CHECK-NEXT:        %52 = arith.andi %51, %50 : i1
+// CHECK-NEXT:        %53 = arith.muli %40, %47 : i32
+// CHECK-NEXT:        %54 = arith.addi %45, %53 : i32
+// CHECK-NEXT:        %55 = arith.muli %40, %43 : i32
+// CHECK-NEXT:        %56 = arith.addi %54, %55 : i32
+// CHECK-NEXT:        %57 = arith.constant 6 : i32
+// CHECK-NEXT:        %58 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %59 = arith.constant 4 : i64
+// CHECK-NEXT:        %60 = arith.index_cast %39 : i32 to index
+// CHECK-NEXT:        %61 = arith.index_cast %60 : index to i64
+// CHECK-NEXT:        %62 = arith.muli %59, %61 : i64
+// CHECK-NEXT:        %63 = arith.addi %62, %58 : i64
+// CHECK-NEXT:        %64 = "llvm.inttoptr"(%63) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %65 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %66 = arith.index_cast %57 : i32 to index
+// CHECK-NEXT:        %67 = arith.index_cast %66 : index to i64
+// CHECK-NEXT:        %68 = arith.muli %59, %67 : i64
+// CHECK-NEXT:        %69 = arith.addi %68, %65 : i64
+// CHECK-NEXT:        %70 = "llvm.inttoptr"(%69) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        "scf.if"(%52) ({
+// CHECK-NEXT:          %71 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %72 = memref.subview %71[52, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 573512>>
+// CHECK-NEXT:          "memref.copy"(%72, %send_buff_ex0) : (memref<101x101xf32, strided<[105, 1], offset: 573512>>, memref<101x101xf32>) -> ()
+// CHECK-NEXT:          %73 = func.call @MPI_Isend(%send_buff_ex0_ptr, %11, %12, %56, %39, %4, %64) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:          %74 = func.call @MPI_Irecv(%recv_buff_ex0_ptr, %11, %12, %56, %39, %4, %70) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
-// CHECK-NEXT:          %106 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%106, %93) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:          %107 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%107, %100) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          %75 = arith.constant 738197504 : i32
+// CHECK-NEXT:          "llvm.store"(%75, %64) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          "llvm.store"(%75, %70) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %108 = arith.constant 1 : i32
-// CHECK-NEXT:        %109 = arith.divui %9, %108 : i32
-// CHECK-NEXT:        %110 = arith.remui %9, %108 : i32
-// CHECK-NEXT:        %111 = arith.constant 1 : i32
-// CHECK-NEXT:        %112 = arith.divui %110, %111 : i32
-// CHECK-NEXT:        %113 = arith.remui %110, %111 : i32
-// CHECK-NEXT:        %114 = arith.constant 1 : i32
-// CHECK-NEXT:        %115 = arith.divui %113, %114 : i32
-// CHECK-NEXT:        %116 = arith.remui %113, %114 : i32
-// CHECK-NEXT:        %117 = arith.constant -1 : i32
-// CHECK-NEXT:        %118 = arith.addi %109, %117 : i32
-// CHECK-NEXT:        %119 = arith.constant 0 : i32
-// CHECK-NEXT:        %120 = arith.cmpi sge, %118, %119 : i32
-// CHECK-NEXT:        %121 = arith.constant true
-// CHECK-NEXT:        %122 = arith.constant true
-// CHECK-NEXT:        %123 = arith.andi %120, %121 : i1
-// CHECK-NEXT:        %124 = arith.andi %123, %122 : i1
-// CHECK-NEXT:        %125 = arith.constant 1 : i32
-// CHECK-NEXT:        %126 = arith.muli %125, %118 : i32
-// CHECK-NEXT:        %127 = arith.addi %115, %126 : i32
-// CHECK-NEXT:        %128 = arith.constant 1 : i32
-// CHECK-NEXT:        %129 = arith.muli %128, %112 : i32
-// CHECK-NEXT:        %130 = arith.addi %127, %129 : i32
-// CHECK-NEXT:        %131 = arith.constant 1 : i32
-// CHECK-NEXT:        %132 = arith.constant 7 : i32
-// CHECK-NEXT:        %133 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %134 = arith.constant 4 : i64
-// CHECK-NEXT:        %135 = arith.index_cast %131 : i32 to index
-// CHECK-NEXT:        %136 = arith.index_cast %135 : index to i64
-// CHECK-NEXT:        %137 = arith.muli %134, %136 : i64
-// CHECK-NEXT:        %138 = arith.addi %137, %133 : i64
-// CHECK-NEXT:        %139 = "llvm.inttoptr"(%138) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %140 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %141 = arith.constant 4 : i64
-// CHECK-NEXT:        %142 = arith.index_cast %132 : i32 to index
-// CHECK-NEXT:        %143 = arith.index_cast %142 : index to i64
-// CHECK-NEXT:        %144 = arith.muli %141, %143 : i64
-// CHECK-NEXT:        %145 = arith.addi %144, %140 : i64
-// CHECK-NEXT:        %146 = "llvm.inttoptr"(%145) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        "scf.if"(%124) ({
-// CHECK-NEXT:          %147 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 0, 0>, "static_sizes" = array<i64: 1, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<101x101xf32, strided<[105, 1], offset: 22262>>
-// CHECK-NEXT:          "memref.copy"(%147, %send_buff_ex1) : (memref<101x101xf32, strided<[105, 1], offset: 22262>>, memref<101x101xf32>) -> ()
-// CHECK-NEXT:          %148 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %149 = func.call @MPI_Isend(%send_buff_ex1_ptr, %20, %21, %130, %61, %148, %139) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
-// CHECK-NEXT:          %150 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %151 = func.call @MPI_Irecv(%recv_buff_ex1_ptr, %24, %25, %130, %61, %150, %146) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:        %76 = arith.remui %8, %40 : i32
+// CHECK-NEXT:        %77 = arith.divui %76, %40 : i32
+// CHECK-NEXT:        %78 = arith.remui %76, %40 : i32
+// CHECK-NEXT:        %79 = arith.divui %78, %40 : i32
+// CHECK-NEXT:        %80 = arith.remui %78, %40 : i32
+// CHECK-NEXT:        %81 = arith.constant -1 : i32
+// CHECK-NEXT:        %82 = arith.addi %41, %81 : i32
+// CHECK-NEXT:        %83 = arith.cmpi sge, %82, %39 : i32
+// CHECK-NEXT:        %84 = arith.andi %83, %50 : i1
+// CHECK-NEXT:        %85 = arith.andi %84, %50 : i1
+// CHECK-NEXT:        %86 = arith.muli %40, %82 : i32
+// CHECK-NEXT:        %87 = arith.addi %79, %86 : i32
+// CHECK-NEXT:        %88 = arith.muli %40, %77 : i32
+// CHECK-NEXT:        %89 = arith.addi %87, %88 : i32
+// CHECK-NEXT:        %90 = arith.constant 7 : i32
+// CHECK-NEXT:        %91 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %92 = arith.index_cast %40 : i32 to index
+// CHECK-NEXT:        %93 = arith.index_cast %92 : index to i64
+// CHECK-NEXT:        %94 = arith.muli %59, %93 : i64
+// CHECK-NEXT:        %95 = arith.addi %94, %91 : i64
+// CHECK-NEXT:        %96 = "llvm.inttoptr"(%95) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %97 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %98 = arith.index_cast %90 : i32 to index
+// CHECK-NEXT:        %99 = arith.index_cast %98 : index to i64
+// CHECK-NEXT:        %100 = arith.muli %59, %99 : i64
+// CHECK-NEXT:        %101 = arith.addi %100, %97 : i64
+// CHECK-NEXT:        %102 = "llvm.inttoptr"(%101) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        "scf.if"(%85) ({
+// CHECK-NEXT:          %103 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %104 = memref.subview %103[2, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%104, %send_buff_ex1) : (memref<101x101xf32, strided<[105, 1], offset: 22262>>, memref<101x101xf32>) -> ()
+// CHECK-NEXT:          %105 = func.call @MPI_Isend(%send_buff_ex1_ptr, %11, %12, %89, %39, %4, %96) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:          %106 = func.call @MPI_Irecv(%recv_buff_ex1_ptr, %11, %12, %89, %39, %4, %102) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
-// CHECK-NEXT:          %152 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%152, %139) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:          %153 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%153, %146) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          %107 = arith.constant 738197504 : i32
+// CHECK-NEXT:          "llvm.store"(%107, %96) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          "llvm.store"(%107, %102) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %154 = arith.constant 1 : i32
-// CHECK-NEXT:        %155 = arith.divui %9, %154 : i32
-// CHECK-NEXT:        %156 = arith.remui %9, %154 : i32
-// CHECK-NEXT:        %157 = arith.constant 1 : i32
-// CHECK-NEXT:        %158 = arith.divui %156, %157 : i32
-// CHECK-NEXT:        %159 = arith.remui %156, %157 : i32
-// CHECK-NEXT:        %160 = arith.constant 1 : i32
-// CHECK-NEXT:        %161 = arith.divui %159, %160 : i32
-// CHECK-NEXT:        %162 = arith.remui %159, %160 : i32
-// CHECK-NEXT:        %163 = arith.constant true
-// CHECK-NEXT:        %164 = arith.constant 1 : i32
-// CHECK-NEXT:        %165 = arith.addi %158, %164 : i32
-// CHECK-NEXT:        %166 = arith.constant 1 : i32
-// CHECK-NEXT:        %167 = arith.cmpi slt, %165, %166 : i32
-// CHECK-NEXT:        %168 = arith.constant true
-// CHECK-NEXT:        %169 = arith.andi %163, %167 : i1
-// CHECK-NEXT:        %170 = arith.andi %169, %168 : i1
-// CHECK-NEXT:        %171 = arith.constant 1 : i32
-// CHECK-NEXT:        %172 = arith.muli %171, %155 : i32
-// CHECK-NEXT:        %173 = arith.addi %161, %172 : i32
-// CHECK-NEXT:        %174 = arith.constant 1 : i32
-// CHECK-NEXT:        %175 = arith.muli %174, %165 : i32
-// CHECK-NEXT:        %176 = arith.addi %173, %175 : i32
-// CHECK-NEXT:        %177 = arith.constant 2 : i32
-// CHECK-NEXT:        %178 = arith.constant 8 : i32
-// CHECK-NEXT:        %179 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %180 = arith.constant 4 : i64
-// CHECK-NEXT:        %181 = arith.index_cast %177 : i32 to index
-// CHECK-NEXT:        %182 = arith.index_cast %181 : index to i64
-// CHECK-NEXT:        %183 = arith.muli %180, %182 : i64
-// CHECK-NEXT:        %184 = arith.addi %183, %179 : i64
-// CHECK-NEXT:        %185 = "llvm.inttoptr"(%184) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %186 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %187 = arith.constant 4 : i64
-// CHECK-NEXT:        %188 = arith.index_cast %178 : i32 to index
-// CHECK-NEXT:        %189 = arith.index_cast %188 : index to i64
-// CHECK-NEXT:        %190 = arith.muli %187, %189 : i64
-// CHECK-NEXT:        %191 = arith.addi %190, %186 : i64
-// CHECK-NEXT:        %192 = "llvm.inttoptr"(%191) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        "scf.if"(%170) ({
-// CHECK-NEXT:          %193 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 100, 0>, "static_sizes" = array<i64: 51, 1, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 1], offset: 32762>>
-// CHECK-NEXT:          "memref.copy"(%193, %send_buff_ex2) : (memref<51x101xf32, strided<[11025, 1], offset: 32762>>, memref<51x101xf32>) -> ()
-// CHECK-NEXT:          %194 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %195 = func.call @MPI_Isend(%send_buff_ex2_ptr, %28, %29, %176, %61, %194, %185) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
-// CHECK-NEXT:          %196 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %197 = func.call @MPI_Irecv(%recv_buff_ex2_ptr, %32, %33, %176, %61, %196, %192) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:        %108 = arith.remui %8, %40 : i32
+// CHECK-NEXT:        %109 = arith.divui %108, %40 : i32
+// CHECK-NEXT:        %110 = arith.remui %108, %40 : i32
+// CHECK-NEXT:        %111 = arith.divui %110, %40 : i32
+// CHECK-NEXT:        %112 = arith.remui %110, %40 : i32
+// CHECK-NEXT:        %113 = arith.addi %109, %40 : i32
+// CHECK-NEXT:        %114 = arith.cmpi slt, %113, %40 : i32
+// CHECK-NEXT:        %115 = arith.andi %50, %114 : i1
+// CHECK-NEXT:        %116 = arith.andi %115, %50 : i1
+// CHECK-NEXT:        %117 = arith.muli %40, %41 : i32
+// CHECK-NEXT:        %118 = arith.addi %111, %117 : i32
+// CHECK-NEXT:        %119 = arith.muli %40, %113 : i32
+// CHECK-NEXT:        %120 = arith.addi %118, %119 : i32
+// CHECK-NEXT:        %121 = arith.constant 8 : i32
+// CHECK-NEXT:        %122 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %123 = arith.index_cast %48 : i32 to index
+// CHECK-NEXT:        %124 = arith.index_cast %123 : index to i64
+// CHECK-NEXT:        %125 = arith.muli %59, %124 : i64
+// CHECK-NEXT:        %126 = arith.addi %125, %122 : i64
+// CHECK-NEXT:        %127 = "llvm.inttoptr"(%126) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %128 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %129 = arith.index_cast %121 : i32 to index
+// CHECK-NEXT:        %130 = arith.index_cast %129 : index to i64
+// CHECK-NEXT:        %131 = arith.muli %59, %130 : i64
+// CHECK-NEXT:        %132 = arith.addi %131, %128 : i64
+// CHECK-NEXT:        %133 = "llvm.inttoptr"(%132) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        "scf.if"(%116) ({
+// CHECK-NEXT:          %134 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %135 = memref.subview %134[2, 102, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 32762>>
+// CHECK-NEXT:          "memref.copy"(%135, %send_buff_ex2) : (memref<51x101xf32, strided<[11025, 1], offset: 32762>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          %136 = func.call @MPI_Isend(%send_buff_ex2_ptr, %21, %12, %120, %39, %4, %127) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:          %137 = func.call @MPI_Irecv(%recv_buff_ex2_ptr, %21, %12, %120, %39, %4, %133) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
-// CHECK-NEXT:          %198 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%198, %185) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:          %199 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%199, %192) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          %138 = arith.constant 738197504 : i32
+// CHECK-NEXT:          "llvm.store"(%138, %127) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          "llvm.store"(%138, %133) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %200 = arith.constant 1 : i32
-// CHECK-NEXT:        %201 = arith.divui %9, %200 : i32
-// CHECK-NEXT:        %202 = arith.remui %9, %200 : i32
-// CHECK-NEXT:        %203 = arith.constant 1 : i32
-// CHECK-NEXT:        %204 = arith.divui %202, %203 : i32
-// CHECK-NEXT:        %205 = arith.remui %202, %203 : i32
-// CHECK-NEXT:        %206 = arith.constant 1 : i32
-// CHECK-NEXT:        %207 = arith.divui %205, %206 : i32
-// CHECK-NEXT:        %208 = arith.remui %205, %206 : i32
-// CHECK-NEXT:        %209 = arith.constant true
-// CHECK-NEXT:        %210 = arith.constant -1 : i32
-// CHECK-NEXT:        %211 = arith.addi %204, %210 : i32
-// CHECK-NEXT:        %212 = arith.constant 0 : i32
-// CHECK-NEXT:        %213 = arith.cmpi sge, %211, %212 : i32
-// CHECK-NEXT:        %214 = arith.constant true
-// CHECK-NEXT:        %215 = arith.andi %209, %213 : i1
-// CHECK-NEXT:        %216 = arith.andi %215, %214 : i1
-// CHECK-NEXT:        %217 = arith.constant 1 : i32
-// CHECK-NEXT:        %218 = arith.muli %217, %201 : i32
-// CHECK-NEXT:        %219 = arith.addi %207, %218 : i32
-// CHECK-NEXT:        %220 = arith.constant 1 : i32
-// CHECK-NEXT:        %221 = arith.muli %220, %211 : i32
-// CHECK-NEXT:        %222 = arith.addi %219, %221 : i32
-// CHECK-NEXT:        %223 = arith.constant 3 : i32
-// CHECK-NEXT:        %224 = arith.constant 9 : i32
-// CHECK-NEXT:        %225 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %226 = arith.constant 4 : i64
-// CHECK-NEXT:        %227 = arith.index_cast %223 : i32 to index
-// CHECK-NEXT:        %228 = arith.index_cast %227 : index to i64
-// CHECK-NEXT:        %229 = arith.muli %226, %228 : i64
-// CHECK-NEXT:        %230 = arith.addi %229, %225 : i64
-// CHECK-NEXT:        %231 = "llvm.inttoptr"(%230) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %232 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %233 = arith.constant 4 : i64
-// CHECK-NEXT:        %234 = arith.index_cast %224 : i32 to index
-// CHECK-NEXT:        %235 = arith.index_cast %234 : index to i64
-// CHECK-NEXT:        %236 = arith.muli %233, %235 : i64
-// CHECK-NEXT:        %237 = arith.addi %236, %232 : i64
-// CHECK-NEXT:        %238 = "llvm.inttoptr"(%237) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        "scf.if"(%216) ({
-// CHECK-NEXT:          %239 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 0, 0>, "static_sizes" = array<i64: 51, 1, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 1], offset: 22262>>
-// CHECK-NEXT:          "memref.copy"(%239, %send_buff_ex3) : (memref<51x101xf32, strided<[11025, 1], offset: 22262>>, memref<51x101xf32>) -> ()
-// CHECK-NEXT:          %240 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %241 = func.call @MPI_Isend(%send_buff_ex3_ptr, %36, %37, %222, %61, %240, %231) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
-// CHECK-NEXT:          %242 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %243 = func.call @MPI_Irecv(%recv_buff_ex3_ptr, %40, %41, %222, %61, %242, %238) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:        %139 = arith.remui %8, %40 : i32
+// CHECK-NEXT:        %140 = arith.divui %139, %40 : i32
+// CHECK-NEXT:        %141 = arith.remui %139, %40 : i32
+// CHECK-NEXT:        %142 = arith.divui %141, %40 : i32
+// CHECK-NEXT:        %143 = arith.remui %141, %40 : i32
+// CHECK-NEXT:        %144 = arith.addi %140, %81 : i32
+// CHECK-NEXT:        %145 = arith.cmpi sge, %144, %39 : i32
+// CHECK-NEXT:        %146 = arith.andi %50, %145 : i1
+// CHECK-NEXT:        %147 = arith.andi %146, %50 : i1
+// CHECK-NEXT:        %148 = arith.addi %142, %117 : i32
+// CHECK-NEXT:        %149 = arith.muli %40, %144 : i32
+// CHECK-NEXT:        %150 = arith.addi %148, %149 : i32
+// CHECK-NEXT:        %151 = arith.constant 3 : i32
+// CHECK-NEXT:        %152 = arith.constant 9 : i32
+// CHECK-NEXT:        %153 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %154 = arith.index_cast %151 : i32 to index
+// CHECK-NEXT:        %155 = arith.index_cast %154 : index to i64
+// CHECK-NEXT:        %156 = arith.muli %59, %155 : i64
+// CHECK-NEXT:        %157 = arith.addi %156, %153 : i64
+// CHECK-NEXT:        %158 = "llvm.inttoptr"(%157) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %159 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %160 = arith.index_cast %152 : i32 to index
+// CHECK-NEXT:        %161 = arith.index_cast %160 : index to i64
+// CHECK-NEXT:        %162 = arith.muli %59, %161 : i64
+// CHECK-NEXT:        %163 = arith.addi %162, %159 : i64
+// CHECK-NEXT:        %164 = "llvm.inttoptr"(%163) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        "scf.if"(%147) ({
+// CHECK-NEXT:          %165 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %166 = memref.subview %165[2, 2, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%166, %send_buff_ex3) : (memref<51x101xf32, strided<[11025, 1], offset: 22262>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          %167 = func.call @MPI_Isend(%send_buff_ex3_ptr, %21, %12, %150, %39, %4, %158) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:          %168 = func.call @MPI_Irecv(%recv_buff_ex3_ptr, %21, %12, %150, %39, %4, %164) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
-// CHECK-NEXT:          %244 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%244, %231) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:          %245 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%245, %238) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          %169 = arith.constant 738197504 : i32
+// CHECK-NEXT:          "llvm.store"(%169, %158) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          "llvm.store"(%169, %164) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %246 = arith.constant 1 : i32
-// CHECK-NEXT:        %247 = arith.divui %9, %246 : i32
-// CHECK-NEXT:        %248 = arith.remui %9, %246 : i32
-// CHECK-NEXT:        %249 = arith.constant 1 : i32
-// CHECK-NEXT:        %250 = arith.divui %248, %249 : i32
-// CHECK-NEXT:        %251 = arith.remui %248, %249 : i32
-// CHECK-NEXT:        %252 = arith.constant 1 : i32
-// CHECK-NEXT:        %253 = arith.divui %251, %252 : i32
-// CHECK-NEXT:        %254 = arith.remui %251, %252 : i32
-// CHECK-NEXT:        %255 = arith.constant true
-// CHECK-NEXT:        %256 = arith.constant true
-// CHECK-NEXT:        %257 = arith.constant 1 : i32
-// CHECK-NEXT:        %258 = arith.addi %253, %257 : i32
-// CHECK-NEXT:        %259 = arith.constant 1 : i32
-// CHECK-NEXT:        %260 = arith.cmpi slt, %258, %259 : i32
-// CHECK-NEXT:        %261 = arith.andi %255, %256 : i1
-// CHECK-NEXT:        %262 = arith.andi %261, %260 : i1
-// CHECK-NEXT:        %263 = arith.constant 1 : i32
-// CHECK-NEXT:        %264 = arith.muli %263, %247 : i32
-// CHECK-NEXT:        %265 = arith.addi %258, %264 : i32
-// CHECK-NEXT:        %266 = arith.constant 1 : i32
-// CHECK-NEXT:        %267 = arith.muli %266, %250 : i32
-// CHECK-NEXT:        %268 = arith.addi %265, %267 : i32
-// CHECK-NEXT:        %269 = arith.constant 4 : i32
-// CHECK-NEXT:        %270 = arith.constant 10 : i32
-// CHECK-NEXT:        %271 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %272 = arith.constant 4 : i64
-// CHECK-NEXT:        %273 = arith.index_cast %269 : i32 to index
-// CHECK-NEXT:        %274 = arith.index_cast %273 : index to i64
-// CHECK-NEXT:        %275 = arith.muli %272, %274 : i64
-// CHECK-NEXT:        %276 = arith.addi %275, %271 : i64
-// CHECK-NEXT:        %277 = "llvm.inttoptr"(%276) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %278 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %279 = arith.constant 4 : i64
-// CHECK-NEXT:        %280 = arith.index_cast %270 : i32 to index
-// CHECK-NEXT:        %281 = arith.index_cast %280 : index to i64
-// CHECK-NEXT:        %282 = arith.muli %279, %281 : i64
-// CHECK-NEXT:        %283 = arith.addi %282, %278 : i64
-// CHECK-NEXT:        %284 = "llvm.inttoptr"(%283) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        "scf.if"(%262) ({
-// CHECK-NEXT:          %285 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 0, 100>, "static_sizes" = array<i64: 51, 101, 1>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 105], offset: 22362>>
-// CHECK-NEXT:          "memref.copy"(%285, %send_buff_ex4) : (memref<51x101xf32, strided<[11025, 105], offset: 22362>>, memref<51x101xf32>) -> ()
-// CHECK-NEXT:          %286 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %287 = func.call @MPI_Isend(%send_buff_ex4_ptr, %44, %45, %268, %61, %286, %277) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
-// CHECK-NEXT:          %288 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %289 = func.call @MPI_Irecv(%recv_buff_ex4_ptr, %48, %49, %268, %61, %288, %284) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:        %170 = arith.remui %8, %40 : i32
+// CHECK-NEXT:        %171 = arith.divui %170, %40 : i32
+// CHECK-NEXT:        %172 = arith.remui %170, %40 : i32
+// CHECK-NEXT:        %173 = arith.divui %172, %40 : i32
+// CHECK-NEXT:        %174 = arith.remui %172, %40 : i32
+// CHECK-NEXT:        %175 = arith.addi %173, %40 : i32
+// CHECK-NEXT:        %176 = arith.cmpi slt, %175, %40 : i32
+// CHECK-NEXT:        %177 = arith.andi %50, %50 : i1
+// CHECK-NEXT:        %178 = arith.andi %177, %176 : i1
+// CHECK-NEXT:        %179 = arith.addi %175, %117 : i32
+// CHECK-NEXT:        %180 = arith.muli %40, %171 : i32
+// CHECK-NEXT:        %181 = arith.addi %179, %180 : i32
+// CHECK-NEXT:        %182 = arith.constant 4 : i32
+// CHECK-NEXT:        %183 = arith.constant 10 : i32
+// CHECK-NEXT:        %184 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %185 = arith.index_cast %182 : i32 to index
+// CHECK-NEXT:        %186 = arith.index_cast %185 : index to i64
+// CHECK-NEXT:        %187 = arith.muli %59, %186 : i64
+// CHECK-NEXT:        %188 = arith.addi %187, %184 : i64
+// CHECK-NEXT:        %189 = "llvm.inttoptr"(%188) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %190 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %191 = arith.index_cast %183 : i32 to index
+// CHECK-NEXT:        %192 = arith.index_cast %191 : index to i64
+// CHECK-NEXT:        %193 = arith.muli %59, %192 : i64
+// CHECK-NEXT:        %194 = arith.addi %193, %190 : i64
+// CHECK-NEXT:        %195 = "llvm.inttoptr"(%194) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        "scf.if"(%178) ({
+// CHECK-NEXT:          %196 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %197 = memref.subview %196[2, 2, 102] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22362>>
+// CHECK-NEXT:          "memref.copy"(%197, %send_buff_ex4) : (memref<51x101xf32, strided<[11025, 105], offset: 22362>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          %198 = func.call @MPI_Isend(%send_buff_ex4_ptr, %21, %12, %181, %39, %4, %189) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:          %199 = func.call @MPI_Irecv(%recv_buff_ex4_ptr, %21, %12, %181, %39, %4, %195) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
-// CHECK-NEXT:          %290 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%290, %277) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:          %291 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%291, %284) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          %200 = arith.constant 738197504 : i32
+// CHECK-NEXT:          "llvm.store"(%200, %189) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          "llvm.store"(%200, %195) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %292 = arith.constant 1 : i32
-// CHECK-NEXT:        %293 = arith.divui %9, %292 : i32
-// CHECK-NEXT:        %294 = arith.remui %9, %292 : i32
-// CHECK-NEXT:        %295 = arith.constant 1 : i32
-// CHECK-NEXT:        %296 = arith.divui %294, %295 : i32
-// CHECK-NEXT:        %297 = arith.remui %294, %295 : i32
-// CHECK-NEXT:        %298 = arith.constant 1 : i32
-// CHECK-NEXT:        %299 = arith.divui %297, %298 : i32
-// CHECK-NEXT:        %300 = arith.remui %297, %298 : i32
-// CHECK-NEXT:        %301 = arith.constant true
-// CHECK-NEXT:        %302 = arith.constant true
-// CHECK-NEXT:        %303 = arith.constant -1 : i32
-// CHECK-NEXT:        %304 = arith.addi %299, %303 : i32
-// CHECK-NEXT:        %305 = arith.constant 0 : i32
-// CHECK-NEXT:        %306 = arith.cmpi sge, %304, %305 : i32
-// CHECK-NEXT:        %307 = arith.andi %301, %302 : i1
-// CHECK-NEXT:        %308 = arith.andi %307, %306 : i1
-// CHECK-NEXT:        %309 = arith.constant 1 : i32
-// CHECK-NEXT:        %310 = arith.muli %309, %293 : i32
-// CHECK-NEXT:        %311 = arith.addi %304, %310 : i32
-// CHECK-NEXT:        %312 = arith.constant 1 : i32
-// CHECK-NEXT:        %313 = arith.muli %312, %296 : i32
-// CHECK-NEXT:        %314 = arith.addi %311, %313 : i32
-// CHECK-NEXT:        %315 = arith.constant 5 : i32
-// CHECK-NEXT:        %316 = arith.constant 11 : i32
-// CHECK-NEXT:        %317 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %318 = arith.constant 4 : i64
-// CHECK-NEXT:        %319 = arith.index_cast %315 : i32 to index
-// CHECK-NEXT:        %320 = arith.index_cast %319 : index to i64
-// CHECK-NEXT:        %321 = arith.muli %318, %320 : i64
-// CHECK-NEXT:        %322 = arith.addi %321, %317 : i64
-// CHECK-NEXT:        %323 = "llvm.inttoptr"(%322) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %324 = "llvm.ptrtoint"(%4) : (!llvm.ptr) -> i64
-// CHECK-NEXT:        %325 = arith.constant 4 : i64
-// CHECK-NEXT:        %326 = arith.index_cast %316 : i32 to index
-// CHECK-NEXT:        %327 = arith.index_cast %326 : index to i64
-// CHECK-NEXT:        %328 = arith.muli %325, %327 : i64
-// CHECK-NEXT:        %329 = arith.addi %328, %324 : i64
-// CHECK-NEXT:        %330 = "llvm.inttoptr"(%329) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        "scf.if"(%308) ({
-// CHECK-NEXT:          %331 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 0, 0>, "static_sizes" = array<i64: 51, 101, 1>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 105], offset: 22262>>
-// CHECK-NEXT:          "memref.copy"(%331, %send_buff_ex5) : (memref<51x101xf32, strided<[11025, 105], offset: 22262>>, memref<51x101xf32>) -> ()
-// CHECK-NEXT:          %332 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %333 = func.call @MPI_Isend(%send_buff_ex5_ptr, %52, %53, %314, %61, %332, %323) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
-// CHECK-NEXT:          %334 = arith.constant 1140850688 : i32
-// CHECK-NEXT:          %335 = func.call @MPI_Irecv(%recv_buff_ex5_ptr, %56, %57, %314, %61, %334, %330) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:        %201 = arith.remui %8, %40 : i32
+// CHECK-NEXT:        %202 = arith.divui %201, %40 : i32
+// CHECK-NEXT:        %203 = arith.remui %201, %40 : i32
+// CHECK-NEXT:        %204 = arith.divui %203, %40 : i32
+// CHECK-NEXT:        %205 = arith.remui %203, %40 : i32
+// CHECK-NEXT:        %206 = arith.addi %204, %81 : i32
+// CHECK-NEXT:        %207 = arith.cmpi sge, %206, %39 : i32
+// CHECK-NEXT:        %208 = arith.andi %177, %207 : i1
+// CHECK-NEXT:        %209 = arith.addi %206, %117 : i32
+// CHECK-NEXT:        %210 = arith.muli %40, %202 : i32
+// CHECK-NEXT:        %211 = arith.addi %209, %210 : i32
+// CHECK-NEXT:        %212 = arith.constant 5 : i32
+// CHECK-NEXT:        %213 = arith.constant 11 : i32
+// CHECK-NEXT:        %214 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %215 = arith.index_cast %212 : i32 to index
+// CHECK-NEXT:        %216 = arith.index_cast %215 : index to i64
+// CHECK-NEXT:        %217 = arith.muli %59, %216 : i64
+// CHECK-NEXT:        %218 = arith.addi %217, %214 : i64
+// CHECK-NEXT:        %219 = "llvm.inttoptr"(%218) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %220 = "llvm.ptrtoint"(%3) : (!llvm.ptr) -> i64
+// CHECK-NEXT:        %221 = arith.index_cast %213 : i32 to index
+// CHECK-NEXT:        %222 = arith.index_cast %221 : index to i64
+// CHECK-NEXT:        %223 = arith.muli %59, %222 : i64
+// CHECK-NEXT:        %224 = arith.addi %223, %220 : i64
+// CHECK-NEXT:        %225 = "llvm.inttoptr"(%224) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        "scf.if"(%208) ({
+// CHECK-NEXT:          %226 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %227 = memref.subview %226[2, 2, 2] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22262>>
+// CHECK-NEXT:          "memref.copy"(%227, %send_buff_ex5) : (memref<51x101xf32, strided<[11025, 105], offset: 22262>>, memref<51x101xf32>) -> ()
+// CHECK-NEXT:          %228 = func.call @MPI_Isend(%send_buff_ex5_ptr, %21, %12, %211, %39, %4, %219) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
+// CHECK-NEXT:          %229 = func.call @MPI_Irecv(%recv_buff_ex5_ptr, %21, %12, %211, %39, %4, %225) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
-// CHECK-NEXT:          %336 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%336, %323) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:          %337 = arith.constant 738197504 : i32
-// CHECK-NEXT:          "llvm.store"(%337, %330) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          %230 = arith.constant 738197504 : i32
+// CHECK-NEXT:          "llvm.store"(%230, %219) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:          "llvm.store"(%230, %225) <{"ordering" = 0 : i64}> : (i32, !llvm.ptr) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %338 = arith.constant 1 : i64
-// CHECK-NEXT:        %339 = "llvm.inttoptr"(%338) : (i64) -> !llvm.ptr
-// CHECK-NEXT:        %340 = func.call @MPI_Waitall(%3, %4, %339) : (i32, !llvm.ptr, !llvm.ptr) -> i32
-// CHECK-NEXT:        "scf.if"(%78) ({
-// CHECK-NEXT:          %341 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 51, 0, 0>, "static_sizes" = array<i64: 1, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<101x101xf32, strided<[105, 1], offset: 584537>>
-// CHECK-NEXT:          "memref.copy"(%recv_buff_ex0, %341) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 584537>>) -> ()
+// CHECK-NEXT:        %231 = "llvm.inttoptr"(%5) : (i64) -> !llvm.ptr
+// CHECK-NEXT:        %232 = func.call @MPI_Waitall(%2, %3, %231) : (i32, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK-NEXT:        "scf.if"(%52) ({
+// CHECK-NEXT:          %233 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %234 = memref.subview %233[53, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 584537>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex0, %234) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 584537>>) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        "scf.if"(%124) ({
-// CHECK-NEXT:          %342 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: -1, 0, 0>, "static_sizes" = array<i64: 1, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<101x101xf32, strided<[105, 1], offset: 11237>>
-// CHECK-NEXT:          "memref.copy"(%recv_buff_ex1, %342) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 11237>>) -> ()
+// CHECK-NEXT:        "scf.if"(%85) ({
+// CHECK-NEXT:          %235 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %236 = memref.subview %235[1, 2, 2] [1, 101, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<101x101xf32, strided<[105, 1], offset: 11237>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex1, %236) : (memref<101x101xf32>, memref<101x101xf32, strided<[105, 1], offset: 11237>>) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        "scf.if"(%170) ({
-// CHECK-NEXT:          %343 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 101, 0>, "static_sizes" = array<i64: 51, 1, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 1], offset: 32867>>
-// CHECK-NEXT:          "memref.copy"(%recv_buff_ex2, %343) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 32867>>) -> ()
+// CHECK-NEXT:        "scf.if"(%116) ({
+// CHECK-NEXT:          %237 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %238 = memref.subview %237[2, 103, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 32867>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex2, %238) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 32867>>) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        "scf.if"(%216) ({
-// CHECK-NEXT:          %344 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, -1, 0>, "static_sizes" = array<i64: 51, 1, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 1], offset: 22157>>
-// CHECK-NEXT:          "memref.copy"(%recv_buff_ex3, %344) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 22157>>) -> ()
+// CHECK-NEXT:        "scf.if"(%147) ({
+// CHECK-NEXT:          %239 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %240 = memref.subview %239[2, 1, 2] [51, 1, 101] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 1], offset: 22157>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex3, %240) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 1], offset: 22157>>) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        "scf.if"(%262) ({
-// CHECK-NEXT:          %345 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 0, 101>, "static_sizes" = array<i64: 51, 101, 1>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 105], offset: 22363>>
-// CHECK-NEXT:          "memref.copy"(%recv_buff_ex4, %345) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22363>>) -> ()
+// CHECK-NEXT:        "scf.if"(%178) ({
+// CHECK-NEXT:          %241 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %242 = memref.subview %241[2, 2, 103] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22363>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex4, %242) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22363>>) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        "scf.if"(%308) ({
-// CHECK-NEXT:          %346 = "memref.subview"(%u_t0_loadview) <{"static_offsets" = array<i64: 0, 0, -1>, "static_sizes" = array<i64: 51, 101, 1>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>) -> memref<51x101xf32, strided<[11025, 105], offset: 22261>>
-// CHECK-NEXT:          "memref.copy"(%recv_buff_ex5, %346) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22261>>) -> ()
+// CHECK-NEXT:        "scf.if"(%208) ({
+// CHECK-NEXT:          %243 = builtin.unrealized_conversion_cast %u_t0 : memref<55x105x105xf32> to memref<55x105x105xf32>
+// CHECK-NEXT:          %244 = memref.subview %243[2, 2, 1] [51, 101, 1] [1, 1, 1] : memref<55x105x105xf32> to memref<51x101xf32, strided<[11025, 105], offset: 22261>>
+// CHECK-NEXT:          "memref.copy"(%recv_buff_ex5, %244) : (memref<51x101xf32>, memref<51x101xf32, strided<[11025, 105], offset: 22261>>) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }, {
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (i1) -> ()
-// CHECK-NEXT:        %u_t2_loadview = "memref.subview"(%u_t2) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:        %347 = arith.constant 0 : index
-// CHECK-NEXT:        %348 = arith.constant 0 : index
-// CHECK-NEXT:        %349 = arith.constant 0 : index
-// CHECK-NEXT:        %350 = arith.constant 1 : index
-// CHECK-NEXT:        %351 = arith.constant 1 : index
-// CHECK-NEXT:        %352 = arith.constant 1 : index
-// CHECK-NEXT:        %353 = arith.constant 51 : index
-// CHECK-NEXT:        %354 = arith.constant 101 : index
-// CHECK-NEXT:        %355 = arith.constant 101 : index
-// CHECK-NEXT:        %356 = arith.constant 0 : index
-// CHECK-NEXT:        %357 = arith.constant 64 : index
-// CHECK-NEXT:        %358 = arith.constant 64 : index
-// CHECK-NEXT:        %359 = arith.muli %350, %357 : index
-// CHECK-NEXT:        %360 = arith.muli %351, %358 : index
-// CHECK-NEXT:        "scf.parallel"(%347, %348, %353, %354, %359, %360) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
-// CHECK-NEXT:        ^0(%361 : index, %362 : index):
-// CHECK-NEXT:          %363 = "affine.min"(%357, %353, %361) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          %364 = "affine.min"(%358, %354, %362) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          "scf.parallel"(%356, %356, %349, %363, %364, %355, %350, %351, %352) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
-// CHECK-NEXT:          ^1(%365 : index, %366 : index, %367 : index):
-// CHECK-NEXT:            %368 = arith.addi %361, %365 : index
-// CHECK-NEXT:            %369 = arith.addi %362, %366 : index
+// CHECK-NEXT:        %245 = memref.subview %u_t1[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %u_t0_blk = memref.subview %u_t0[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %u_t2_blk = memref.subview %u_t2[2, 2, 2] [55, 105, 105] [1, 1, 1] : memref<55x105x105xf32> to memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:        %246 = arith.constant 0 : index
+// CHECK-NEXT:        %247 = arith.constant 51 : index
+// CHECK-NEXT:        %248 = arith.constant 101 : index
+// CHECK-NEXT:        %249 = arith.constant 64 : index
+// CHECK-NEXT:        %250 = arith.muli %time_m, %249 : index
+// CHECK-NEXT:        "scf.parallel"(%246, %246, %247, %248, %250, %250) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
+// CHECK-NEXT:        ^0(%251 : index, %252 : index):
+// CHECK-NEXT:          %253 = "affine.min"(%249, %247, %251) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          %254 = "affine.min"(%249, %248, %252) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          "scf.parallel"(%246, %246, %246, %253, %254, %248, %time_m, %time_m, %time_m) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
+// CHECK-NEXT:          ^1(%255 : index, %256 : index, %257 : index):
+// CHECK-NEXT:            %258 = arith.addi %251, %255 : index
+// CHECK-NEXT:            %259 = arith.addi %252, %256 : index
 // CHECK-NEXT:            %dt = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:            %370 = arith.constant 2 : i64
-// CHECK-NEXT:            %371 = "math.fpowi"(%dt, %370) : (f32, i64) -> f32
-// CHECK-NEXT:            %372 = arith.constant -1 : i64
-// CHECK-NEXT:            %dt_1 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:            %373 = arith.constant -2 : i64
-// CHECK-NEXT:            %374 = "math.fpowi"(%dt_1, %373) : (f32, i64) -> f32
-// CHECK-NEXT:            %375 = memref.load %u_t2_loadview[%368, %369, %367] : memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %376 = arith.mulf %374, %375 : f32
-// CHECK-NEXT:            %377 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %dt_2 = arith.constant 1.000000e-04 : f32
-// CHECK-NEXT:            %378 = arith.constant -2 : i64
-// CHECK-NEXT:            %379 = "math.fpowi"(%dt_2, %378) : (f32, i64) -> f32
-// CHECK-NEXT:            %380 = memref.load %u_t0_loadview[%368, %369, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %381 = arith.mulf %377, %379 : f32
-// CHECK-NEXT:            %382 = arith.mulf %381, %380 : f32
-// CHECK-NEXT:            %383 = arith.addf %376, %382 : f32
-// CHECK-NEXT:            %384 = arith.sitofp %372 : i64 to f32
-// CHECK-NEXT:            %385 = arith.mulf %384, %383 : f32
+// CHECK-NEXT:            %260 = arith.constant 2 : i64
+// CHECK-NEXT:            %261 = "math.fpowi"(%dt, %260) : (f32, i64) -> f32
+// CHECK-NEXT:            %262 = arith.constant -1 : i64
+// CHECK-NEXT:            %263 = arith.constant -2 : i64
+// CHECK-NEXT:            %264 = "math.fpowi"(%dt, %263) : (f32, i64) -> f32
+// CHECK-NEXT:            %265 = memref.load %u_t2_blk[%258, %259, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %266 = arith.mulf %264, %265 : f32
+// CHECK-NEXT:            %267 = arith.constant -2.000000e+00 : f32
+// CHECK-NEXT:            %268 = memref.load %u_t0_blk[%258, %259, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %269 = arith.mulf %267, %264 : f32
+// CHECK-NEXT:            %270 = arith.mulf %269, %268 : f32
+// CHECK-NEXT:            %271 = arith.addf %266, %270 : f32
+// CHECK-NEXT:            %272 = arith.sitofp %262 : i64 to f32
+// CHECK-NEXT:            %273 = arith.mulf %272, %271 : f32
 // CHECK-NEXT:            %h_x = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %386 = arith.constant -2 : i64
-// CHECK-NEXT:            %387 = "math.fpowi"(%h_x, %386) : (f32, i64) -> f32
-// CHECK-NEXT:            %388 = arith.constant -1 : index
-// CHECK-NEXT:            %389 = arith.addi %368, %388 : index
-// CHECK-NEXT:            %390 = memref.load %u_t0_loadview[%389, %369, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %391 = arith.mulf %387, %390 : f32
-// CHECK-NEXT:            %h_x_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %392 = arith.constant -2 : i64
-// CHECK-NEXT:            %393 = "math.fpowi"(%h_x_1, %392) : (f32, i64) -> f32
-// CHECK-NEXT:            %394 = arith.constant 1 : index
-// CHECK-NEXT:            %395 = arith.addi %368, %394 : index
-// CHECK-NEXT:            %396 = memref.load %u_t0_loadview[%395, %369, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %397 = arith.mulf %393, %396 : f32
-// CHECK-NEXT:            %398 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_x_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %399 = arith.constant -2 : i64
-// CHECK-NEXT:            %400 = "math.fpowi"(%h_x_2, %399) : (f32, i64) -> f32
-// CHECK-NEXT:            %401 = memref.load %u_t0_loadview[%368, %369, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %402 = arith.mulf %398, %400 : f32
-// CHECK-NEXT:            %403 = arith.mulf %402, %401 : f32
-// CHECK-NEXT:            %404 = arith.addf %391, %397 : f32
-// CHECK-NEXT:            %405 = arith.addf %404, %403 : f32
-// CHECK-NEXT:            %h_y = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %406 = arith.constant -2 : i64
-// CHECK-NEXT:            %407 = "math.fpowi"(%h_y, %406) : (f32, i64) -> f32
-// CHECK-NEXT:            %408 = arith.constant -1 : index
-// CHECK-NEXT:            %409 = arith.addi %369, %408 : index
-// CHECK-NEXT:            %410 = memref.load %u_t0_loadview[%368, %409, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %411 = arith.mulf %407, %410 : f32
-// CHECK-NEXT:            %h_y_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %412 = arith.constant -2 : i64
-// CHECK-NEXT:            %413 = "math.fpowi"(%h_y_1, %412) : (f32, i64) -> f32
-// CHECK-NEXT:            %414 = arith.constant 1 : index
-// CHECK-NEXT:            %415 = arith.addi %369, %414 : index
-// CHECK-NEXT:            %416 = memref.load %u_t0_loadview[%368, %415, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %417 = arith.mulf %413, %416 : f32
-// CHECK-NEXT:            %418 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_y_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %419 = arith.constant -2 : i64
-// CHECK-NEXT:            %420 = "math.fpowi"(%h_y_2, %419) : (f32, i64) -> f32
-// CHECK-NEXT:            %421 = memref.load %u_t0_loadview[%368, %369, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %422 = arith.mulf %418, %420 : f32
-// CHECK-NEXT:            %423 = arith.mulf %422, %421 : f32
-// CHECK-NEXT:            %424 = arith.addf %411, %417 : f32
-// CHECK-NEXT:            %425 = arith.addf %424, %423 : f32
-// CHECK-NEXT:            %h_z = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %426 = arith.constant -2 : i64
-// CHECK-NEXT:            %427 = "math.fpowi"(%h_z, %426) : (f32, i64) -> f32
-// CHECK-NEXT:            %428 = arith.constant -1 : index
-// CHECK-NEXT:            %429 = arith.addi %367, %428 : index
-// CHECK-NEXT:            %430 = memref.load %u_t0_loadview[%368, %369, %429] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %431 = arith.mulf %427, %430 : f32
-// CHECK-NEXT:            %h_z_1 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %432 = arith.constant -2 : i64
-// CHECK-NEXT:            %433 = "math.fpowi"(%h_z_1, %432) : (f32, i64) -> f32
-// CHECK-NEXT:            %434 = arith.constant 1 : index
-// CHECK-NEXT:            %435 = arith.addi %367, %434 : index
-// CHECK-NEXT:            %436 = memref.load %u_t0_loadview[%368, %369, %435] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %437 = arith.mulf %433, %436 : f32
-// CHECK-NEXT:            %438 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_z_2 = arith.constant 1.000000e-02 : f32
-// CHECK-NEXT:            %439 = arith.constant -2 : i64
-// CHECK-NEXT:            %440 = "math.fpowi"(%h_z_2, %439) : (f32, i64) -> f32
-// CHECK-NEXT:            %441 = memref.load %u_t0_loadview[%368, %369, %367] : memref<53x103x103xf32, strided<[11025, 105, 1], offset: 22262>>
-// CHECK-NEXT:            %442 = arith.mulf %438, %440 : f32
-// CHECK-NEXT:            %443 = arith.mulf %442, %441 : f32
-// CHECK-NEXT:            %444 = arith.addf %431, %437 : f32
-// CHECK-NEXT:            %445 = arith.addf %444, %443 : f32
-// CHECK-NEXT:            %446 = arith.addf %385, %405 : f32
-// CHECK-NEXT:            %447 = arith.addf %446, %425 : f32
-// CHECK-NEXT:            %448 = arith.addf %447, %445 : f32
-// CHECK-NEXT:            %449 = arith.mulf %371, %448 : f32
-// CHECK-NEXT:            memref.store %449, %u_t1_storeview[%368, %369, %367] : memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %274 = "math.fpowi"(%h_x, %263) : (f32, i64) -> f32
+// CHECK-NEXT:            %275 = arith.constant -1 : index
+// CHECK-NEXT:            %276 = arith.addi %258, %275 : index
+// CHECK-NEXT:            %277 = memref.load %u_t0_blk[%276, %259, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %278 = arith.mulf %274, %277 : f32
+// CHECK-NEXT:            %279 = arith.addi %258, %time_m : index
+// CHECK-NEXT:            %280 = memref.load %u_t0_blk[%279, %259, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %281 = arith.mulf %274, %280 : f32
+// CHECK-NEXT:            %282 = arith.mulf %267, %274 : f32
+// CHECK-NEXT:            %283 = arith.mulf %282, %268 : f32
+// CHECK-NEXT:            %284 = arith.addf %278, %281 : f32
+// CHECK-NEXT:            %285 = arith.addf %284, %283 : f32
+// CHECK-NEXT:            %286 = arith.addi %259, %275 : index
+// CHECK-NEXT:            %287 = memref.load %u_t0_blk[%258, %286, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %288 = arith.mulf %274, %287 : f32
+// CHECK-NEXT:            %289 = arith.addi %259, %time_m : index
+// CHECK-NEXT:            %290 = memref.load %u_t0_blk[%258, %289, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %291 = arith.mulf %274, %290 : f32
+// CHECK-NEXT:            %292 = arith.addf %288, %291 : f32
+// CHECK-NEXT:            %293 = arith.addf %292, %283 : f32
+// CHECK-NEXT:            %294 = arith.addi %257, %275 : index
+// CHECK-NEXT:            %295 = memref.load %u_t0_blk[%258, %259, %294] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %296 = arith.mulf %274, %295 : f32
+// CHECK-NEXT:            %297 = arith.addi %257, %time_m : index
+// CHECK-NEXT:            %298 = memref.load %u_t0_blk[%258, %259, %297] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
+// CHECK-NEXT:            %299 = arith.mulf %274, %298 : f32
+// CHECK-NEXT:            %300 = arith.addf %296, %299 : f32
+// CHECK-NEXT:            %301 = arith.addf %300, %283 : f32
+// CHECK-NEXT:            %302 = arith.addf %273, %285 : f32
+// CHECK-NEXT:            %303 = arith.addf %302, %293 : f32
+// CHECK-NEXT:            %304 = arith.addf %303, %301 : f32
+// CHECK-NEXT:            %305 = arith.mulf %261, %304 : f32
+// CHECK-NEXT:            memref.store %305, %245[%258, %259, %257] : memref<55x105x105xf32, strided<[11025, 105, 1], offset: 22262>>
 // CHECK-NEXT:            scf.yield
 // CHECK-NEXT:          }) : (index, index, index, index, index, index, index, index, index) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (index, index, index, index, index, index) -> ()
-// CHECK-NEXT:        %u_t1_temp = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 2, 2, 2>, "static_sizes" = array<i64: 51, 101, 101>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<55x105x105xf32>) -> memref<51x101x101xf32, strided<[11025, 105, 1], offset: 22262>>
 // CHECK-NEXT:        scf.yield %u_t1, %u_t2, %u_t0 : memref<55x105x105xf32>, memref<55x105x105xf32>, memref<55x105x105xf32>
 // CHECK-NEXT:      }
-// CHECK-NEXT:      %450 = func.call @timer_end(%0) : (f64) -> f64
-// CHECK-NEXT:      "llvm.store"(%450, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
+// CHECK-NEXT:      %306 = func.call @timer_end(%0) : (f64) -> f64
+// CHECK-NEXT:      "llvm.store"(%306, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
 // CHECK-NEXT:      func.return
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64
@@ -698,4 +541,4 @@ builtin.module {
 // CHECK-NEXT:    func.func private @MPI_Isend(!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:    func.func private @MPI_Irecv(!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK-NEXT:    func.func private @MPI_Waitall(i32, !llvm.ptr, !llvm.ptr) -> i32
-// CHECK-NEXT:  }
\ No newline at end of file
+// CHECK-NEXT:  }
diff --git a/tests/filecheck/xdsl_pipeline.mlir b/tests/filecheck/xdsl_pipeline.mlir
index f2981a3a17..207eb73d56 100644
--- a/tests/filecheck/xdsl_pipeline.mlir
+++ b/tests/filecheck/xdsl_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: xdsl-opt -p canonicalize,cse,shape-inference,stencil-bufferize,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,0},printf-to-llvm,canonicalize %s | filecheck %s
+// RUN: xdsl-opt -p "canonicalize,cse,shape-inference,stencil-bufferize,convert-stencil-to-ll-mlir,scf-parallel-loop-tiling{parallel-loop-tile-sizes=64,0},printf-to-llvm,canonicalize,cse" %s | filecheck %s
 
 builtin.module {
   func.func @Kernel(%f2_vec0 : !stencil.field<[-2,5]x[-2,5]xf32>, %f2_vec1 : !stencil.field<[-2,5]x[-2,5]xf32>, %timers : !llvm.ptr) {
@@ -81,98 +81,63 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 0 : index
 // CHECK-NEXT:      %time_M = arith.constant 1 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
-// CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3, %4 = scf.for %time = %time_m to %2 step %step iter_args(%f2_t0 = %f2_vec0, %f2_t1 = %f2_vec1) -> (memref<7x7xf32>, memref<7x7xf32>) {
-// CHECK-NEXT:        %f2_t1_storeview = "memref.subview"(%f2_t1) <{"static_offsets" = array<i64: 2, 2>, "static_sizes" = array<i64: 3, 3>, "static_strides" = array<i64: 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<7x7xf32>) -> memref<3x3xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:        %f2_t0_loadview = "memref.subview"(%f2_t0) <{"static_offsets" = array<i64: 2, 2>, "static_sizes" = array<i64: 5, 5>, "static_strides" = array<i64: 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<7x7xf32>) -> memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:        %5 = arith.constant 0 : index
-// CHECK-NEXT:        %6 = arith.constant 0 : index
-// CHECK-NEXT:        %7 = arith.constant 1 : index
-// CHECK-NEXT:        %8 = arith.constant 1 : index
-// CHECK-NEXT:        %9 = arith.constant 3 : index
-// CHECK-NEXT:        %10 = arith.constant 3 : index
-// CHECK-NEXT:        %11 = arith.constant 0 : index
-// CHECK-NEXT:        %12 = arith.constant 64 : index
-// CHECK-NEXT:        %13 = arith.muli %7, %12 : index
-// CHECK-NEXT:        "scf.parallel"(%5, %9, %13) <{"operandSegmentSizes" = array<i32: 1, 1, 1, 0>}> ({
-// CHECK-NEXT:        ^0(%14 : index):
-// CHECK-NEXT:          %15 = "affine.min"(%12, %9, %14) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          "scf.parallel"(%11, %6, %15, %10, %7, %8) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
-// CHECK-NEXT:          ^1(%16 : index, %17 : index):
-// CHECK-NEXT:            %18 = arith.addi %14, %16 : index
-// CHECK-NEXT:            %19 = arith.constant 5.000000e-01 : f32
+// CHECK-NEXT:      %1 = arith.addi %time_M, %time_M : index
+// CHECK-NEXT:      %2, %3 = scf.for %time = %time_m to %1 step %time_M iter_args(%f2_t0 = %f2_vec0, %f2_t1 = %f2_vec1) -> (memref<7x7xf32>, memref<7x7xf32>) {
+// CHECK-NEXT:        %4 = memref.subview %f2_t1[2, 2] [7, 7] [1, 1] : memref<7x7xf32> to memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:        %f2_t0_blk = memref.subview %f2_t0[2, 2] [7, 7] [1, 1] : memref<7x7xf32> to memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:        %5 = arith.constant 3 : index
+// CHECK-NEXT:        %6 = arith.constant 64 : index
+// CHECK-NEXT:        %7 = arith.muli %time_M, %6 : index
+// CHECK-NEXT:        "scf.parallel"(%time_m, %5, %7) <{"operandSegmentSizes" = array<i32: 1, 1, 1, 0>}> ({
+// CHECK-NEXT:        ^0(%8 : index):
+// CHECK-NEXT:          %9 = "affine.min"(%6, %5, %8) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          "scf.parallel"(%time_m, %time_m, %9, %5, %time_M, %time_M) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
+// CHECK-NEXT:          ^1(%10 : index, %11 : index):
+// CHECK-NEXT:            %12 = arith.addi %8, %10 : index
 // CHECK-NEXT:            %h_x = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %20 = arith.constant -2 : i64
-// CHECK-NEXT:            %21 = "math.fpowi"(%h_x, %20) : (f32, i64) -> f32
-// CHECK-NEXT:            %22 = arith.constant -1 : index
-// CHECK-NEXT:            %23 = arith.addi %18, %22 : index
-// CHECK-NEXT:            %24 = memref.load %f2_t0_loadview[%23, %17] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %25 = arith.mulf %21, %24 : f32
-// CHECK-NEXT:            %h_x_1 = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %26 = arith.constant -2 : i64
-// CHECK-NEXT:            %27 = "math.fpowi"(%h_x_1, %26) : (f32, i64) -> f32
-// CHECK-NEXT:            %28 = arith.constant 1 : index
-// CHECK-NEXT:            %29 = arith.addi %18, %28 : index
-// CHECK-NEXT:            %30 = memref.load %f2_t0_loadview[%29, %17] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %31 = arith.mulf %27, %30 : f32
-// CHECK-NEXT:            %32 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_x_2 = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %33 = arith.constant -2 : i64
-// CHECK-NEXT:            %34 = "math.fpowi"(%h_x_2, %33) : (f32, i64) -> f32
-// CHECK-NEXT:            %35 = memref.load %f2_t0_loadview[%18, %17] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %36 = arith.mulf %32, %34 : f32
-// CHECK-NEXT:            %37 = arith.mulf %36, %35 : f32
-// CHECK-NEXT:            %38 = arith.addf %25, %31 : f32
-// CHECK-NEXT:            %39 = arith.addf %38, %37 : f32
-// CHECK-NEXT:            %40 = arith.mulf %19, %39 : f32
-// CHECK-NEXT:            %41 = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %h_y = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %42 = arith.constant -2 : i64
-// CHECK-NEXT:            %43 = "math.fpowi"(%h_y, %42) : (f32, i64) -> f32
-// CHECK-NEXT:            %44 = arith.constant -1 : index
-// CHECK-NEXT:            %45 = arith.addi %17, %44 : index
-// CHECK-NEXT:            %46 = memref.load %f2_t0_loadview[%18, %45] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %47 = arith.mulf %43, %46 : f32
-// CHECK-NEXT:            %h_y_1 = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %48 = arith.constant -2 : i64
-// CHECK-NEXT:            %49 = "math.fpowi"(%h_y_1, %48) : (f32, i64) -> f32
-// CHECK-NEXT:            %50 = arith.constant 1 : index
-// CHECK-NEXT:            %51 = arith.addi %17, %50 : index
-// CHECK-NEXT:            %52 = memref.load %f2_t0_loadview[%18, %51] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %53 = arith.mulf %49, %52 : f32
-// CHECK-NEXT:            %54 = arith.constant -2.000000e+00 : f32
-// CHECK-NEXT:            %h_y_2 = arith.constant 5.000000e-01 : f32
-// CHECK-NEXT:            %55 = arith.constant -2 : i64
-// CHECK-NEXT:            %56 = "math.fpowi"(%h_y_2, %55) : (f32, i64) -> f32
-// CHECK-NEXT:            %57 = memref.load %f2_t0_loadview[%18, %17] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %58 = arith.mulf %54, %56 : f32
-// CHECK-NEXT:            %59 = arith.mulf %58, %57 : f32
-// CHECK-NEXT:            %60 = arith.addf %47, %53 : f32
-// CHECK-NEXT:            %61 = arith.addf %60, %59 : f32
-// CHECK-NEXT:            %62 = arith.mulf %41, %61 : f32
+// CHECK-NEXT:            %13 = arith.constant -2 : i64
+// CHECK-NEXT:            %14 = "math.fpowi"(%h_x, %13) : (f32, i64) -> f32
+// CHECK-NEXT:            %15 = arith.constant -1 : index
+// CHECK-NEXT:            %16 = arith.addi %12, %15 : index
+// CHECK-NEXT:            %17 = memref.load %f2_t0_blk[%16, %11] : memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:            %18 = arith.mulf %14, %17 : f32
+// CHECK-NEXT:            %19 = arith.addi %12, %time_M : index
+// CHECK-NEXT:            %20 = memref.load %f2_t0_blk[%19, %11] : memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:            %21 = arith.mulf %14, %20 : f32
+// CHECK-NEXT:            %22 = arith.constant -2.000000e+00 : f32
+// CHECK-NEXT:            %23 = memref.load %f2_t0_blk[%12, %11] : memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:            %24 = arith.mulf %22, %14 : f32
+// CHECK-NEXT:            %25 = arith.mulf %24, %23 : f32
+// CHECK-NEXT:            %26 = arith.addf %18, %21 : f32
+// CHECK-NEXT:            %27 = arith.addf %26, %25 : f32
+// CHECK-NEXT:            %28 = arith.mulf %h_x, %27 : f32
+// CHECK-NEXT:            %29 = arith.addi %11, %15 : index
+// CHECK-NEXT:            %30 = memref.load %f2_t0_blk[%12, %29] : memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:            %31 = arith.mulf %14, %30 : f32
+// CHECK-NEXT:            %32 = arith.addi %11, %time_M : index
+// CHECK-NEXT:            %33 = memref.load %f2_t0_blk[%12, %32] : memref<7x7xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:            %34 = arith.mulf %14, %33 : f32
+// CHECK-NEXT:            %35 = arith.addf %31, %34 : f32
+// CHECK-NEXT:            %36 = arith.addf %35, %25 : f32
+// CHECK-NEXT:            %37 = arith.mulf %h_x, %36 : f32
 // CHECK-NEXT:            %dt = arith.constant 1.000000e-01 : f32
-// CHECK-NEXT:            %63 = arith.constant -1 : i64
-// CHECK-NEXT:            %64 = "math.fpowi"(%dt, %63) : (f32, i64) -> f32
-// CHECK-NEXT:            %65 = memref.load %f2_t0_loadview[%18, %17] : memref<5x5xf32, strided<[7, 1], offset: 16>>
-// CHECK-NEXT:            %66 = arith.mulf %64, %65 : f32
-// CHECK-NEXT:            %67 = arith.addf %40, %62 : f32
-// CHECK-NEXT:            %68 = arith.addf %67, %66 : f32
-// CHECK-NEXT:            %dt_1 = arith.constant 1.000000e-01 : f32
-// CHECK-NEXT:            %69 = arith.mulf %68, %dt_1 : f32
-// CHECK-NEXT:            memref.store %69, %f2_t1_storeview[%18, %17] : memref<3x3xf32, strided<[7, 1], offset: 16>>
+// CHECK-NEXT:            %38 = arith.constant -1 : i64
+// CHECK-NEXT:            %39 = "math.fpowi"(%dt, %38) : (f32, i64) -> f32
+// CHECK-NEXT:            %40 = arith.mulf %39, %23 : f32
+// CHECK-NEXT:            %41 = arith.addf %28, %37 : f32
+// CHECK-NEXT:            %42 = arith.addf %41, %40 : f32
+// CHECK-NEXT:            %43 = arith.mulf %42, %dt : f32
+// CHECK-NEXT:            memref.store %43, %4[%12, %11] : memref<7x7xf32, strided<[7, 1], offset: 16>>
 // CHECK-NEXT:            scf.yield
 // CHECK-NEXT:          }) : (index, index, index, index, index, index) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (index, index, index) -> ()
 // CHECK-NEXT:        scf.yield %f2_t1, %f2_t0 : memref<7x7xf32>, memref<7x7xf32>
 // CHECK-NEXT:      }
-// CHECK-NEXT:      %70 = func.call @timer_end(%0) : (f64) -> f64
-// CHECK-NEXT:      "llvm.store"(%70, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
+// CHECK-NEXT:      %44 = func.call @timer_end(%0) : (f64) -> f64
+// CHECK-NEXT:      "llvm.store"(%44, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
 // CHECK-NEXT:      func.return
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64
 // CHECK-NEXT:    func.func private @timer_end(f64) -> f64
 // CHECK-NEXT:  }
-// CHECK-NEXT:  
diff --git a/tests/filecheck/xdsl_pipeline_openmp.mlir b/tests/filecheck/xdsl_pipeline_openmp.mlir
index 127763e598..76394b2249 100644
--- a/tests/filecheck/xdsl_pipeline_openmp.mlir
+++ b/tests/filecheck/xdsl_pipeline_openmp.mlir
@@ -207,196 +207,108 @@ builtin.module {
 // CHECK-NEXT:      %0 = func.call @timer_start() : () -> f64
 // CHECK-NEXT:      %time_m = arith.constant 0 : index
 // CHECK-NEXT:      %time_M = arith.constant 250 : index
-// CHECK-NEXT:      %1 = arith.constant 1 : index
-// CHECK-NEXT:      %2 = arith.addi %time_M, %1 : index
 // CHECK-NEXT:      %step = arith.constant 1 : index
-// CHECK-NEXT:      %3, %4 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1) -> (memref<158x158x158xf32>, memref<158x158x158xf32>) {
-// CHECK-NEXT:        %u_t1_storeview = "memref.subview"(%u_t1) <{"static_offsets" = array<i64: 4, 4, 4>, "static_sizes" = array<i64: 150, 150, 150>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<158x158x158xf32>) -> memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:        %u_t0_loadview = "memref.subview"(%u_t0) <{"static_offsets" = array<i64: 4, 4, 4>, "static_sizes" = array<i64: 154, 154, 154>, "static_strides" = array<i64: 1, 1, 1>, "operandSegmentSizes" = array<i32: 1, 0, 0, 0>}> : (memref<158x158x158xf32>) -> memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:        %5 = arith.constant 0 : index
-// CHECK-NEXT:        %6 = arith.constant 0 : index
-// CHECK-NEXT:        %7 = arith.constant 0 : index
-// CHECK-NEXT:        %8 = arith.constant 1 : index
-// CHECK-NEXT:        %9 = arith.constant 1 : index
-// CHECK-NEXT:        %10 = arith.constant 1 : index
-// CHECK-NEXT:        %11 = arith.constant 150 : index
-// CHECK-NEXT:        %12 = arith.constant 150 : index
-// CHECK-NEXT:        %13 = arith.constant 150 : index
-// CHECK-NEXT:        %14 = arith.constant 0 : index
-// CHECK-NEXT:        %15 = arith.constant 64 : index
-// CHECK-NEXT:        %16 = arith.constant 64 : index
-// CHECK-NEXT:        %17 = arith.muli %8, %15 : index
-// CHECK-NEXT:        %18 = arith.muli %9, %16 : index
-// CHECK-NEXT:        "scf.parallel"(%5, %6, %11, %12, %17, %18) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
-// CHECK-NEXT:        ^0(%19 : index, %20 : index):
-// CHECK-NEXT:          %21 = "affine.min"(%15, %11, %19) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          %22 = "affine.min"(%16, %12, %20) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
-// CHECK-NEXT:          "scf.parallel"(%14, %14, %7, %21, %22, %13, %8, %9, %10) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
-// CHECK-NEXT:          ^1(%23 : index, %24 : index, %25 : index):
-// CHECK-NEXT:            %26 = arith.addi %19, %23 : index
-// CHECK-NEXT:            %27 = arith.addi %20, %24 : index
+// CHECK-NEXT:      %1 = arith.addi %time_M, %step : index
+// CHECK-NEXT:      %2, %3 = scf.for %time = %time_m to %1 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1) -> (memref<158x158x158xf32>, memref<158x158x158xf32>) {
+// CHECK-NEXT:        %u_t1_storeview = memref.subview %u_t1[4, 4, 4] [150, 150, 150] [1, 1, 1] : memref<158x158x158xf32> to memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:        %u_t0_loadview = memref.subview %u_t0[4, 4, 4] [154, 154, 154] [1, 1, 1] : memref<158x158x158xf32> to memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:        %4 = arith.constant 150 : index
+// CHECK-NEXT:        %5 = arith.constant 64 : index
+// CHECK-NEXT:        %6 = arith.muli %step, %5 : index
+// CHECK-NEXT:        "scf.parallel"(%time_m, %time_m, %4, %4, %6, %6) <{"operandSegmentSizes" = array<i32: 2, 2, 2, 0>}> ({
+// CHECK-NEXT:        ^0(%7 : index, %8 : index):
+// CHECK-NEXT:          %9 = "affine.min"(%5, %4, %7) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          %10 = "affine.min"(%5, %4, %8) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index
+// CHECK-NEXT:          "scf.parallel"(%time_m, %time_m, %time_m, %9, %10, %4, %step, %step, %step) <{"operandSegmentSizes" = array<i32: 3, 3, 3, 0>}> ({
+// CHECK-NEXT:          ^1(%11 : index, %12 : index, %13 : index):
+// CHECK-NEXT:            %14 = arith.addi %7, %11 : index
+// CHECK-NEXT:            %15 = arith.addi %8, %12 : index
 // CHECK-NEXT:            %dt = arith.constant 6.717825e-07 : f32
-// CHECK-NEXT:            %28 = arith.constant -1 : i64
-// CHECK-NEXT:            %29 = "math.fpowi"(%dt, %28) : (f32, i64) -> f32
-// CHECK-NEXT:            %30 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %31 = arith.mulf %29, %30 : f32
-// CHECK-NEXT:            %32 = arith.constant 1.333333e+00 : f32
+// CHECK-NEXT:            %16 = arith.constant -1 : i64
+// CHECK-NEXT:            %17 = "math.fpowi"(%dt, %16) : (f32, i64) -> f32
+// CHECK-NEXT:            %18 = memref.load %u_t0_loadview[%14, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %19 = arith.mulf %17, %18 : f32
+// CHECK-NEXT:            %20 = arith.constant 1.333333e+00 : f32
 // CHECK-NEXT:            %h_x = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %33 = arith.constant -2 : i64
-// CHECK-NEXT:            %34 = "math.fpowi"(%h_x, %33) : (f32, i64) -> f32
-// CHECK-NEXT:            %35 = arith.constant -1 : index
-// CHECK-NEXT:            %36 = arith.addi %26, %35 : index
-// CHECK-NEXT:            %37 = memref.load %u_t0_loadview[%36, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %38 = arith.mulf %32, %34 : f32
-// CHECK-NEXT:            %39 = arith.mulf %38, %37 : f32
-// CHECK-NEXT:            %40 = arith.constant 1.333333e+00 : f32
-// CHECK-NEXT:            %h_x_1 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %41 = arith.constant -2 : i64
-// CHECK-NEXT:            %42 = "math.fpowi"(%h_x_1, %41) : (f32, i64) -> f32
-// CHECK-NEXT:            %43 = arith.constant 1 : index
-// CHECK-NEXT:            %44 = arith.addi %26, %43 : index
-// CHECK-NEXT:            %45 = memref.load %u_t0_loadview[%44, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %46 = arith.mulf %40, %42 : f32
-// CHECK-NEXT:            %47 = arith.mulf %46, %45 : f32
-// CHECK-NEXT:            %48 = arith.constant -2.500000e+00 : f32
-// CHECK-NEXT:            %h_x_2 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %49 = arith.constant -2 : i64
-// CHECK-NEXT:            %50 = "math.fpowi"(%h_x_2, %49) : (f32, i64) -> f32
-// CHECK-NEXT:            %51 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %52 = arith.mulf %48, %50 : f32
-// CHECK-NEXT:            %53 = arith.mulf %52, %51 : f32
-// CHECK-NEXT:            %54 = arith.constant -8.333333e-02 : f32
-// CHECK-NEXT:            %h_x_3 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %55 = arith.constant -2 : i64
-// CHECK-NEXT:            %56 = "math.fpowi"(%h_x_3, %55) : (f32, i64) -> f32
-// CHECK-NEXT:            %57 = arith.constant -2 : index
-// CHECK-NEXT:            %58 = arith.addi %26, %57 : index
-// CHECK-NEXT:            %59 = memref.load %u_t0_loadview[%58, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %60 = arith.mulf %54, %56 : f32
-// CHECK-NEXT:            %61 = arith.mulf %60, %59 : f32
-// CHECK-NEXT:            %62 = arith.constant -8.333333e-02 : f32
-// CHECK-NEXT:            %h_x_4 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %63 = arith.constant -2 : i64
-// CHECK-NEXT:            %64 = "math.fpowi"(%h_x_4, %63) : (f32, i64) -> f32
-// CHECK-NEXT:            %65 = arith.constant 2 : index
-// CHECK-NEXT:            %66 = arith.addi %26, %65 : index
-// CHECK-NEXT:            %67 = memref.load %u_t0_loadview[%66, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %68 = arith.mulf %62, %64 : f32
-// CHECK-NEXT:            %69 = arith.mulf %68, %67 : f32
-// CHECK-NEXT:            %70 = arith.addf %39, %47 : f32
-// CHECK-NEXT:            %71 = arith.addf %70, %53 : f32
-// CHECK-NEXT:            %72 = arith.addf %71, %61 : f32
-// CHECK-NEXT:            %73 = arith.addf %72, %69 : f32
-// CHECK-NEXT:            %74 = arith.constant 1.333333e+00 : f32
-// CHECK-NEXT:            %h_y = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %75 = arith.constant -2 : i64
-// CHECK-NEXT:            %76 = "math.fpowi"(%h_y, %75) : (f32, i64) -> f32
-// CHECK-NEXT:            %77 = arith.constant -1 : index
-// CHECK-NEXT:            %78 = arith.addi %27, %77 : index
-// CHECK-NEXT:            %79 = memref.load %u_t0_loadview[%26, %78, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %80 = arith.mulf %74, %76 : f32
-// CHECK-NEXT:            %81 = arith.mulf %80, %79 : f32
-// CHECK-NEXT:            %82 = arith.constant 1.333333e+00 : f32
-// CHECK-NEXT:            %h_y_1 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %83 = arith.constant -2 : i64
-// CHECK-NEXT:            %84 = "math.fpowi"(%h_y_1, %83) : (f32, i64) -> f32
-// CHECK-NEXT:            %85 = arith.constant 1 : index
-// CHECK-NEXT:            %86 = arith.addi %27, %85 : index
-// CHECK-NEXT:            %87 = memref.load %u_t0_loadview[%26, %86, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %88 = arith.mulf %82, %84 : f32
-// CHECK-NEXT:            %89 = arith.mulf %88, %87 : f32
-// CHECK-NEXT:            %90 = arith.constant -2.500000e+00 : f32
-// CHECK-NEXT:            %h_y_2 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %91 = arith.constant -2 : i64
-// CHECK-NEXT:            %92 = "math.fpowi"(%h_y_2, %91) : (f32, i64) -> f32
-// CHECK-NEXT:            %93 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %94 = arith.mulf %90, %92 : f32
-// CHECK-NEXT:            %95 = arith.mulf %94, %93 : f32
-// CHECK-NEXT:            %96 = arith.constant -8.333333e-02 : f32
-// CHECK-NEXT:            %h_y_3 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %97 = arith.constant -2 : i64
-// CHECK-NEXT:            %98 = "math.fpowi"(%h_y_3, %97) : (f32, i64) -> f32
-// CHECK-NEXT:            %99 = arith.constant -2 : index
-// CHECK-NEXT:            %100 = arith.addi %27, %99 : index
-// CHECK-NEXT:            %101 = memref.load %u_t0_loadview[%26, %100, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %102 = arith.mulf %96, %98 : f32
-// CHECK-NEXT:            %103 = arith.mulf %102, %101 : f32
-// CHECK-NEXT:            %104 = arith.constant -8.333333e-02 : f32
-// CHECK-NEXT:            %h_y_4 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %105 = arith.constant -2 : i64
-// CHECK-NEXT:            %106 = "math.fpowi"(%h_y_4, %105) : (f32, i64) -> f32
-// CHECK-NEXT:            %107 = arith.constant 2 : index
-// CHECK-NEXT:            %108 = arith.addi %27, %107 : index
-// CHECK-NEXT:            %109 = memref.load %u_t0_loadview[%26, %108, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %110 = arith.mulf %104, %106 : f32
-// CHECK-NEXT:            %111 = arith.mulf %110, %109 : f32
-// CHECK-NEXT:            %112 = arith.addf %81, %89 : f32
-// CHECK-NEXT:            %113 = arith.addf %112, %95 : f32
-// CHECK-NEXT:            %114 = arith.addf %113, %103 : f32
-// CHECK-NEXT:            %115 = arith.addf %114, %111 : f32
-// CHECK-NEXT:            %116 = arith.constant 1.333333e+00 : f32
-// CHECK-NEXT:            %h_z = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %117 = arith.constant -2 : i64
-// CHECK-NEXT:            %118 = "math.fpowi"(%h_z, %117) : (f32, i64) -> f32
-// CHECK-NEXT:            %119 = arith.constant -1 : index
-// CHECK-NEXT:            %120 = arith.addi %25, %119 : index
-// CHECK-NEXT:            %121 = memref.load %u_t0_loadview[%26, %27, %120] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %122 = arith.mulf %116, %118 : f32
-// CHECK-NEXT:            %123 = arith.mulf %122, %121 : f32
-// CHECK-NEXT:            %124 = arith.constant 1.333333e+00 : f32
-// CHECK-NEXT:            %h_z_1 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %125 = arith.constant -2 : i64
-// CHECK-NEXT:            %126 = "math.fpowi"(%h_z_1, %125) : (f32, i64) -> f32
-// CHECK-NEXT:            %127 = arith.constant 1 : index
-// CHECK-NEXT:            %128 = arith.addi %25, %127 : index
-// CHECK-NEXT:            %129 = memref.load %u_t0_loadview[%26, %27, %128] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %130 = arith.mulf %124, %126 : f32
-// CHECK-NEXT:            %131 = arith.mulf %130, %129 : f32
-// CHECK-NEXT:            %132 = arith.constant -2.500000e+00 : f32
-// CHECK-NEXT:            %h_z_2 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %133 = arith.constant -2 : i64
-// CHECK-NEXT:            %134 = "math.fpowi"(%h_z_2, %133) : (f32, i64) -> f32
-// CHECK-NEXT:            %135 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %136 = arith.mulf %132, %134 : f32
-// CHECK-NEXT:            %137 = arith.mulf %136, %135 : f32
-// CHECK-NEXT:            %138 = arith.constant -8.333333e-02 : f32
-// CHECK-NEXT:            %h_z_3 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %139 = arith.constant -2 : i64
-// CHECK-NEXT:            %140 = "math.fpowi"(%h_z_3, %139) : (f32, i64) -> f32
-// CHECK-NEXT:            %141 = arith.constant -2 : index
-// CHECK-NEXT:            %142 = arith.addi %25, %141 : index
-// CHECK-NEXT:            %143 = memref.load %u_t0_loadview[%26, %27, %142] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %144 = arith.mulf %138, %140 : f32
-// CHECK-NEXT:            %145 = arith.mulf %144, %143 : f32
-// CHECK-NEXT:            %146 = arith.constant -8.333333e-02 : f32
-// CHECK-NEXT:            %h_z_4 = arith.constant 1.342282e-02 : f32
-// CHECK-NEXT:            %147 = arith.constant -2 : i64
-// CHECK-NEXT:            %148 = "math.fpowi"(%h_z_4, %147) : (f32, i64) -> f32
-// CHECK-NEXT:            %149 = arith.constant 2 : index
-// CHECK-NEXT:            %150 = arith.addi %25, %149 : index
-// CHECK-NEXT:            %151 = memref.load %u_t0_loadview[%26, %27, %150] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
-// CHECK-NEXT:            %152 = arith.mulf %146, %148 : f32
-// CHECK-NEXT:            %153 = arith.mulf %152, %151 : f32
-// CHECK-NEXT:            %154 = arith.addf %123, %131 : f32
-// CHECK-NEXT:            %155 = arith.addf %154, %137 : f32
-// CHECK-NEXT:            %156 = arith.addf %155, %145 : f32
-// CHECK-NEXT:            %157 = arith.addf %156, %153 : f32
-// CHECK-NEXT:            %158 = arith.addf %73, %115 : f32
-// CHECK-NEXT:            %159 = arith.addf %158, %157 : f32
+// CHECK-NEXT:            %21 = arith.constant -2 : i64
+// CHECK-NEXT:            %22 = "math.fpowi"(%h_x, %21) : (f32, i64) -> f32
+// CHECK-NEXT:            %23 = arith.constant -1 : index
+// CHECK-NEXT:            %24 = arith.addi %14, %23 : index
+// CHECK-NEXT:            %25 = memref.load %u_t0_loadview[%24, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %26 = arith.mulf %20, %22 : f32
+// CHECK-NEXT:            %27 = arith.mulf %26, %25 : f32
+// CHECK-NEXT:            %28 = arith.addi %14, %step : index
+// CHECK-NEXT:            %29 = memref.load %u_t0_loadview[%28, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %30 = arith.mulf %26, %29 : f32
+// CHECK-NEXT:            %31 = arith.constant -2.500000e+00 : f32
+// CHECK-NEXT:            %32 = memref.load %u_t0_loadview[%14, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %33 = arith.mulf %31, %22 : f32
+// CHECK-NEXT:            %34 = arith.mulf %33, %32 : f32
+// CHECK-NEXT:            %35 = arith.constant -8.333333e-02 : f32
+// CHECK-NEXT:            %36 = arith.constant -2 : index
+// CHECK-NEXT:            %37 = arith.addi %14, %36 : index
+// CHECK-NEXT:            %38 = memref.load %u_t0_loadview[%37, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %39 = arith.mulf %35, %22 : f32
+// CHECK-NEXT:            %40 = arith.mulf %39, %38 : f32
+// CHECK-NEXT:            %41 = arith.constant 2 : index
+// CHECK-NEXT:            %42 = arith.addi %14, %41 : index
+// CHECK-NEXT:            %43 = memref.load %u_t0_loadview[%42, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %44 = arith.mulf %39, %43 : f32
+// CHECK-NEXT:            %45 = arith.addf %27, %30 : f32
+// CHECK-NEXT:            %46 = arith.addf %45, %34 : f32
+// CHECK-NEXT:            %47 = arith.addf %46, %40 : f32
+// CHECK-NEXT:            %48 = arith.addf %47, %44 : f32
+// CHECK-NEXT:            %49 = arith.addi %15, %23 : index
+// CHECK-NEXT:            %50 = memref.load %u_t0_loadview[%14, %49, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %51 = arith.mulf %26, %50 : f32
+// CHECK-NEXT:            %52 = arith.addi %15, %step : index
+// CHECK-NEXT:            %53 = memref.load %u_t0_loadview[%14, %52, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %54 = arith.mulf %26, %53 : f32
+// CHECK-NEXT:            %55 = memref.load %u_t0_loadview[%14, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %56 = arith.mulf %33, %55 : f32
+// CHECK-NEXT:            %57 = arith.addi %15, %36 : index
+// CHECK-NEXT:            %58 = memref.load %u_t0_loadview[%14, %57, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %59 = arith.mulf %39, %58 : f32
+// CHECK-NEXT:            %60 = arith.addi %15, %41 : index
+// CHECK-NEXT:            %61 = memref.load %u_t0_loadview[%14, %60, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %62 = arith.mulf %39, %61 : f32
+// CHECK-NEXT:            %63 = arith.addf %51, %54 : f32
+// CHECK-NEXT:            %64 = arith.addf %63, %56 : f32
+// CHECK-NEXT:            %65 = arith.addf %64, %59 : f32
+// CHECK-NEXT:            %66 = arith.addf %65, %62 : f32
+// CHECK-NEXT:            %67 = arith.addi %13, %23 : index
+// CHECK-NEXT:            %68 = memref.load %u_t0_loadview[%14, %15, %67] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %69 = arith.mulf %26, %68 : f32
+// CHECK-NEXT:            %70 = arith.addi %13, %step : index
+// CHECK-NEXT:            %71 = memref.load %u_t0_loadview[%14, %15, %70] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %72 = arith.mulf %26, %71 : f32
+// CHECK-NEXT:            %73 = memref.load %u_t0_loadview[%14, %15, %13] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %74 = arith.mulf %33, %73 : f32
+// CHECK-NEXT:            %75 = arith.addi %13, %36 : index
+// CHECK-NEXT:            %76 = memref.load %u_t0_loadview[%14, %15, %75] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %77 = arith.mulf %39, %76 : f32
+// CHECK-NEXT:            %78 = arith.addi %13, %41 : index
+// CHECK-NEXT:            %79 = memref.load %u_t0_loadview[%14, %15, %78] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %80 = arith.mulf %39, %79 : f32
+// CHECK-NEXT:            %81 = arith.addf %69, %72 : f32
+// CHECK-NEXT:            %82 = arith.addf %81, %74 : f32
+// CHECK-NEXT:            %83 = arith.addf %82, %77 : f32
+// CHECK-NEXT:            %84 = arith.addf %83, %80 : f32
+// CHECK-NEXT:            %85 = arith.addf %48, %66 : f32
+// CHECK-NEXT:            %86 = arith.addf %85, %84 : f32
 // CHECK-NEXT:            %a = arith.constant 9.000000e-01 : f32
-// CHECK-NEXT:            %160 = arith.mulf %159, %a : f32
-// CHECK-NEXT:            %161 = arith.addf %31, %160 : f32
-// CHECK-NEXT:            %dt_1 = arith.constant 6.717825e-07 : f32
-// CHECK-NEXT:            %162 = arith.mulf %161, %dt_1 : f32
-// CHECK-NEXT:            memref.store %162, %u_t1_storeview[%26, %27, %25] : memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>>
+// CHECK-NEXT:            %87 = arith.mulf %86, %a : f32
+// CHECK-NEXT:            %88 = arith.addf %19, %87 : f32
+// CHECK-NEXT:            %89 = arith.mulf %88, %dt : f32
+// CHECK-NEXT:            memref.store %89, %u_t1_storeview[%14, %15, %13] : memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>>
 // CHECK-NEXT:            scf.yield
 // CHECK-NEXT:          }) : (index, index, index, index, index, index, index, index, index) -> ()
 // CHECK-NEXT:          scf.yield
 // CHECK-NEXT:        }) : (index, index, index, index, index, index) -> ()
 // CHECK-NEXT:        scf.yield %u_t1, %u_t0 : memref<158x158x158xf32>, memref<158x158x158xf32>
 // CHECK-NEXT:      }
-// CHECK-NEXT:      %163 = func.call @timer_end(%0) : (f64) -> f64
-// CHECK-NEXT:      "llvm.store"(%163, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
+// CHECK-NEXT:      %90 = func.call @timer_end(%0) : (f64) -> f64
+// CHECK-NEXT:      "llvm.store"(%90, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> ()
 // CHECK-NEXT:      func.return
 // CHECK-NEXT:    }
 // CHECK-NEXT:    func.func private @timer_start() -> f64

From 9f1c990b313df933103fd985a4c802c7aa5993cc Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Tue, 20 Aug 2024 17:57:46 +0100
Subject: [PATCH 11/25] Update pytests.

---
 tests/test_xdsl_op_correctness.py | 10 ++++++----
 tests/test_xdsl_operator.py       |  5 +++--
 tests/test_xdsl_passes.py         |  5 ++++-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/test_xdsl_op_correctness.py b/tests/test_xdsl_op_correctness.py
index f0cb28d794..eb2a10c890 100644
--- a/tests/test_xdsl_op_correctness.py
+++ b/tests/test_xdsl_op_correctness.py
@@ -91,20 +91,22 @@ def test_u_and_v_conversion():
 
     scffor_ops = list(ops[6].regions[0].blocks[0].ops)
 
-    assert len(scffor_ops) == 7
+    assert len(scffor_ops) == 9
 
     # First
     assert isinstance(scffor_ops[0], LoadOp)
     assert isinstance(scffor_ops[1], LoadOp)
     assert isinstance(scffor_ops[2], ApplyOp)
     assert isinstance(scffor_ops[3], StoreOp)
+    assert isinstance(scffor_ops[4], LoadOp)
 
     # Second
-    assert isinstance(scffor_ops[4], ApplyOp)
-    assert isinstance(scffor_ops[5], StoreOp)
+    assert isinstance(scffor_ops[5], ApplyOp)
+    assert isinstance(scffor_ops[6], StoreOp)
+    assert isinstance(scffor_ops[7], LoadOp)
 
     # Yield
-    assert isinstance(scffor_ops[6], Yield)
+    assert isinstance(scffor_ops[8], Yield)
 
     assert type(ops[7] == Call)
     assert type(ops[8] == StoreOp)
diff --git a/tests/test_xdsl_operator.py b/tests/test_xdsl_operator.py
index 9fa0fee21a..f06df254de 100644
--- a/tests/test_xdsl_operator.py
+++ b/tests/test_xdsl_operator.py
@@ -32,15 +32,16 @@ def test_create_xdsl_operator():
     assert type(ops[6] == For)
 
     scffor_ops = list(ops[6].regions[0].blocks[0].ops)
-    assert len(scffor_ops) == 4
+    assert len(scffor_ops) == 5
 
     # First
     assert isinstance(scffor_ops[0], LoadOp)
     assert isinstance(scffor_ops[1], ApplyOp)
     assert isinstance(scffor_ops[2], StoreOp)
+    assert isinstance(scffor_ops[3], LoadOp)
 
     # Yield
-    assert isinstance(scffor_ops[3], Yield)
+    assert isinstance(scffor_ops[4], Yield)
 
     assert type(ops[7] == Call)
     assert type(ops[8] == StoreOp)
diff --git a/tests/test_xdsl_passes.py b/tests/test_xdsl_passes.py
index 9fce974b8d..ff937d92b2 100644
--- a/tests/test_xdsl_passes.py
+++ b/tests/test_xdsl_passes.py
@@ -90,10 +90,13 @@ def test_xdsl_III():
 
     scffor_ops = list(ops[6].regions[0].blocks[0].ops)
 
+    assert len(scffor_ops) == 5
+
     assert isinstance(scffor_ops[0], LoadOp)
     assert isinstance(scffor_ops[1], ApplyOp)
     assert isinstance(scffor_ops[2], StoreOp)
-    assert isinstance(scffor_ops[3], Yield)
+    assert isinstance(scffor_ops[3], LoadOp)
+    assert isinstance(scffor_ops[4], Yield)
 
     assert type(ops[7] == Call)
     assert type(ops[8] == StoreOp)

From 233b70ab008363b4786288f9c56f3092d4facb09 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 12:03:57 +0100
Subject: [PATCH 12/25] Bump to xDSL main.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index a187172c3e..47b2cb718f 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
+        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index 7cd7d7cbb1..04bd3d8208 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
+        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index 6dd145604e..bff3de5933 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
+        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index 39bc907225..9b80ddf909 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
+        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index 796882d73e..9f9d43106f 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@935675efd470505028466917126f8cc8f64ce4e3
+        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
         
     - name: Test no-MPI, no-Openmp
       run: |

From ef1fbf56c6e44e5652e7b1ec60ec46526aab8baa Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 14:22:57 +0100
Subject: [PATCH 13/25] Bump to xDSL supporting inplace.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 47b2cb718f..35b24eec97 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
+        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index 04bd3d8208..d952fb4765 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
+        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index bff3de5933..b4bc3c34f3 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
+        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index 9b80ddf909..60471ccdc8 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
+        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index 9f9d43106f..9ab998511f 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@bd700e9665c040d478a4f3bfa286ede66216b5ed
+        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
         
     - name: Test no-MPI, no-Openmp
       run: |

From 6e0673ca3a2c36f927cd8f53722b554b78a55216 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 14:30:46 +0100
Subject: [PATCH 14/25] Little dictionary fix?

---
 devito/ir/xdsl_iet/cluster_to_ssa.py | 5 ++---
 tests/test_xdsl_op_correctness.py    | 3 ---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py
index 48d2489d12..f540e405a5 100644
--- a/devito/ir/xdsl_iet/cluster_to_ssa.py
+++ b/devito/ir/xdsl_iet/cluster_to_ssa.py
@@ -386,13 +386,12 @@ def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None:
 
         lb = stencil.IndexAttr.get(*([0] * len(shape)))
         ub = stencil.IndexAttr.get(*shape)
-
         store = stencil.StoreOp.get(
             apply.res[0],
-            self.function_values[self.out_time_buffer],
+            self.block_args[self.out_time_buffer],
             stencil.StencilBoundsAttr(zip(lb, ub)),
         )
-        load = stencil.LoadOp.get(self.function_values[self.out_time_buffer])
+        load = stencil.LoadOp.get(self.block_args[self.out_time_buffer])
         load.res.name_hint = f"{write_function.name}_t{self.out_time_buffer[1]}_temp"  # noqa
         self.temps[self.out_time_buffer] = load.res
 
diff --git a/tests/test_xdsl_op_correctness.py b/tests/test_xdsl_op_correctness.py
index eb2a10c890..4dd5b7e758 100644
--- a/tests/test_xdsl_op_correctness.py
+++ b/tests/test_xdsl_op_correctness.py
@@ -130,9 +130,6 @@ def test_symbol_I():
     assert ops[0].result.name_hint == a.name
     assert type(ops[0] == Return)
 
-
-# This test should fail, as we are trying to use an inplace operation
-@pytest.mark.xfail(reason="Cannot store to a field that is loaded from")
 def test_inplace():
     # Define a simple Devito Operator
     grid = Grid(shape=(3, 3))

From b9b1bbcbfcdc10fd8b78fa0b15e86a4f32cece8c Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 14:32:58 +0100
Subject: [PATCH 15/25] Flake

---
 tests/test_xdsl_op_correctness.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_xdsl_op_correctness.py b/tests/test_xdsl_op_correctness.py
index 4dd5b7e758..6f8c5efed9 100644
--- a/tests/test_xdsl_op_correctness.py
+++ b/tests/test_xdsl_op_correctness.py
@@ -130,6 +130,7 @@ def test_symbol_I():
     assert ops[0].result.name_hint == a.name
     assert type(ops[0] == Return)
 
+
 def test_inplace():
     # Define a simple Devito Operator
     grid = Grid(shape=(3, 3))

From 26218f263b0025d3fee53811f88c3f2183dfc49e Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 14:41:41 +0100
Subject: [PATCH 16/25] Try this dictionary. Some more comments would seem
 useful.

---
 devito/ir/xdsl_iet/cluster_to_ssa.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py
index f540e405a5..240d32ec09 100644
--- a/devito/ir/xdsl_iet/cluster_to_ssa.py
+++ b/devito/ir/xdsl_iet/cluster_to_ssa.py
@@ -386,12 +386,12 @@ def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None:
 
         lb = stencil.IndexAttr.get(*([0] * len(shape)))
         ub = stencil.IndexAttr.get(*shape)
-        store = stencil.StoreOp.get(
+        stencil.StoreOp.get(
             apply.res[0],
-            self.block_args[self.out_time_buffer],
+            self.function_args[self.out_time_buffer],
             stencil.StencilBoundsAttr(zip(lb, ub)),
         )
-        load = stencil.LoadOp.get(self.block_args[self.out_time_buffer])
+        load = stencil.LoadOp.get(self.function_args[self.out_time_buffer])
         load.res.name_hint = f"{write_function.name}_t{self.out_time_buffer[1]}_temp"  # noqa
         self.temps[self.out_time_buffer] = load.res
 

From 89e774ce100a4ff5fb62fb5f295be499376721df Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 14:52:49 +0100
Subject: [PATCH 17/25] Dictionary tweak.

---
 devito/ir/xdsl_iet/cluster_to_ssa.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py
index 48d2489d12..11c4d8a718 100644
--- a/devito/ir/xdsl_iet/cluster_to_ssa.py
+++ b/devito/ir/xdsl_iet/cluster_to_ssa.py
@@ -198,7 +198,7 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr,
             if output_indexed is not None:
                 space_offsets = ([node.indices[d] - output_indexed.indices[d]
                                  for d in node.function.space_dimensions])
-                temp = self.function_values[(node.function, time_offset)]
+                temp = self.apply_temps[(node.function, time_offset)]
                 access = stencil.AccessOp.get(temp, space_offsets)
                 return access.res
             # Otherwise, generate a load op
@@ -378,8 +378,6 @@ def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None:
             apply_arg.name_hint = apply_op.name_hint.replace("temp", "blk")
 
         self.apply_temps = {k: v for k, v in zip(read_functions, apply.region.block.args)}
-        # Update the function values with the new temps
-        self.function_values |= self.apply_temps
 
         with ImplicitBuilder(apply.region.block):
             stencil.ReturnOp.get([self._visit_math_nodes(dim, eq.rhs, eq.lhs)])

From a2a69225f2f0bbf35f105336ce42a0058afdcecf Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 15:01:10 +0100
Subject: [PATCH 18/25] Sync

---
 devito/ir/xdsl_iet/cluster_to_ssa.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py
index f2f2383bb6..11c4d8a718 100644
--- a/devito/ir/xdsl_iet/cluster_to_ssa.py
+++ b/devito/ir/xdsl_iet/cluster_to_ssa.py
@@ -384,12 +384,13 @@ def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None:
 
         lb = stencil.IndexAttr.get(*([0] * len(shape)))
         ub = stencil.IndexAttr.get(*shape)
-        stencil.StoreOp.get(
+
+        store = stencil.StoreOp.get(
             apply.res[0],
-            self.function_args[self.out_time_buffer],
+            self.function_values[self.out_time_buffer],
             stencil.StencilBoundsAttr(zip(lb, ub)),
         )
-        load = stencil.LoadOp.get(self.function_args[self.out_time_buffer])
+        load = stencil.LoadOp.get(self.function_values[self.out_time_buffer])
         load.res.name_hint = f"{write_function.name}_t{self.out_time_buffer[1]}_temp"  # noqa
         self.temps[self.out_time_buffer] = load.res
 

From 5366f327d7ce57e423d32a0b5ac06d9ae49dd496 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 17:11:21 +0100
Subject: [PATCH 19/25] Revert more subtle inplace test and present a working
 one.

---
 tests/test_xdsl_op_correctness.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/test_xdsl_op_correctness.py b/tests/test_xdsl_op_correctness.py
index 6f8c5efed9..2934770e97 100644
--- a/tests/test_xdsl_op_correctness.py
+++ b/tests/test_xdsl_op_correctness.py
@@ -131,7 +131,22 @@ def test_symbol_I():
     assert type(ops[0] == Return)
 
 
-def test_inplace():
+def test_inplace_I():
+    # Define a simple Devito Operator
+    grid = Grid(shape=(3, 3))
+    u = TimeFunction(name="u", grid=grid, time_order=2)
+
+    u.data[:] = 0.0001
+
+    eq0 = Eq(u, u + 2)
+
+    xdsl_op = Operator([eq0], opt="xdsl")
+    xdsl_op.apply(time_M=5, dt=0.1)
+
+
+# This test should fail, as we are trying to use an inplace operation with some dependencies
+@pytest.mark.xfail(reason="Cannot store to a field that is loaded from")
+def test_inplace_II():
     # Define a simple Devito Operator
     grid = Grid(shape=(3, 3))
     u = TimeFunction(name='u', grid=grid, time_order=2)

From fb54a023db42938f4b5edfde6639399d78c6b989 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 17:13:10 +0100
Subject: [PATCH 20/25] Flake.

---
 tests/test_xdsl_op_correctness.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_xdsl_op_correctness.py b/tests/test_xdsl_op_correctness.py
index 2934770e97..8e74b23992 100644
--- a/tests/test_xdsl_op_correctness.py
+++ b/tests/test_xdsl_op_correctness.py
@@ -144,7 +144,8 @@ def test_inplace_I():
     xdsl_op.apply(time_M=5, dt=0.1)
 
 
-# This test should fail, as we are trying to use an inplace operation with some dependencies
+# This test should fail, as we are trying to use an inplace operation with some
+# dependencies
 @pytest.mark.xfail(reason="Cannot store to a field that is loaded from")
 def test_inplace_II():
     # Define a simple Devito Operator

From 3eafa472e67483fc04dccf925d7ad3e0cda3ebd1 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 17:29:26 +0100
Subject: [PATCH 21/25] Lift another xfail

---
 tests/test_xdsl_base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_xdsl_base.py b/tests/test_xdsl_base.py
index 72ddfcb16a..4cba6d9a7a 100644
--- a/tests/test_xdsl_base.py
+++ b/tests/test_xdsl_base.py
@@ -948,7 +948,6 @@ def test_function_III():
     assert np.isclose(norm(v), devito_norm_v)
 
 
-@pytest.mark.xfail(reason="Operation does not verify: Cannot Load and Store the same field!")  # noqa
 def test_function_IV():
     # Define a Devito Operator with multiple eqs
     grid = Grid(shape=(4, 4))

From 06293fee2a0630ffac8b5f1f6c6ee82786f37028 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 17:47:52 +0100
Subject: [PATCH 22/25] Implement return type conversion.

---
 devito/ir/xdsl_iet/cluster_to_ssa.py | 37 +++++++++++++--------
 tests/test_xdsl_base.py              | 49 +++++++++++++---------------
 2 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py
index 37713b5322..66a9741c35 100644
--- a/devito/ir/xdsl_iet/cluster_to_ssa.py
+++ b/devito/ir/xdsl_iet/cluster_to_ssa.py
@@ -287,7 +287,7 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr,
             SSAargs = (self._visit_math_nodes(dim, arg, output_indexed)
                        for arg in node.args)
             return reduce(lambda x, y : arith.AndI(x, y).result, SSAargs)
-        
+
         # Trigonometric functions
         elif isinstance(node, sin):
             assert len(node.args) == 1, "Expected single argument for sin."
@@ -298,13 +298,13 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr,
             assert len(node.args) == 1, "Expected single argument for cos."           
             return math.CosOp(self._visit_math_nodes(dim, node.args[0],
                               output_indexed)).result
-        
+
         elif isinstance(node, tan):
             assert len(node.args) == 1, "Expected single argument for TanOp."
-            
+
             return math.TanOp(self._visit_math_nodes(dim, node.args[0],
                               output_indexed)).result
-                   
+
         elif isinstance(node, Relational):
             if isinstance(node, GreaterThan):
                 mnemonic = "sge"
@@ -382,7 +382,20 @@ def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None:
         self.function_values |= self.apply_temps
 
         with ImplicitBuilder(apply.region.block):
-            stencil.ReturnOp.get([self._visit_math_nodes(dim, eq.rhs, eq.lhs)])
+            result = self._visit_math_nodes(dim, eq.rhs, eq.lhs)
+            expected_type = apply.res[0].type.get_element_type()
+            match expected_type:
+                case result.type:
+                    pass
+                case builtin.f32:
+                    if result.type == IndexType():
+                        result = arith.IndexCastOp(result, builtin.i64).result
+                    result = arith.SIToFPOp(result, builtin.f32).result
+                case builtin.IndexType:
+                    result = arith.IndexCastOp(result, IndexType()).result
+                case _:
+                    raise Exception(f"Unexpected result type {type(result)}")
+            stencil.ReturnOp.get([result])
 
         lb = stencil.IndexAttr.get(*([0] * len(shape)))
         ub = stencil.IndexAttr.get(*shape)
@@ -439,7 +452,6 @@ def build_condition(self, dim: SteppingDimension, eq: BooleanFunction):
             self.build_generic_step_expression(dim, eq)
             scf.Yield()
 
-
     def build_time_loop(
         self, eqs: list[Any], step_dim: SteppingDimension, **kwargs
     ):
@@ -450,7 +462,7 @@ def build_time_loop(
         ub = iet_ssa.LoadSymbolic.get(
             step_dim.symbolic_max._C_name, IndexType()
         )
-        
+
         one = arith.Constant.from_int_and_width(1, IndexType())
 
         # Devito iterates from time_m to time_M *inclusive*, MLIR only takes
@@ -497,7 +509,7 @@ def build_time_loop(
             for i, (f, t) in enumerate(self.time_buffers)
         }
         self.function_values |= self.block_args
-        
+
         # Name the block argument for debugging
         for (f, t), arg in self.block_args.items():
             arg.name_hint = f"{f.name}_t{t}"
@@ -513,8 +525,7 @@ def build_time_loop(
 
     def lower_devito_Eqs(self, eqs: list[Any], **kwargs):
         # Lower devito Equations to xDSL
-        
-        
+
         for eq in eqs:
             lowered = self.operator._lower_exprs(as_tuple(eq), **kwargs)
             if isinstance(eq, Eq):
@@ -546,7 +557,7 @@ def _lower_injection(self, eqs: list[LoweredEq]):
                 lb = arith.Constant.from_int_and_width(int(lower), IndexType())
             else:
                 raise NotImplementedError(f"Lower bound of type {type(lower)} not supported")
-            
+
             try:
                 name = interval.dim.symbolic_min.name
             except:
@@ -633,7 +644,7 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> ModuleOp:
         # Instantiate the module.
         self.function_values: dict[tuple[Function, int], SSAValue] = {}
         self.symbol_values: dict[str, SSAValue] = {}
-        
+
         module = ModuleOp(Region([block := Block([])]))
         with ImplicitBuilder(block):
             # Get all functions used in the equations
@@ -647,7 +658,7 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> ModuleOp:
                         functions.add(f.function)
 
                 elif isinstance(eq, Injection):
-                    
+
                     functions.add(eq.field.function)
                     for f in retrieve_functions(eq.expr):
                         if isinstance(f, PointSource):
diff --git a/tests/test_xdsl_base.py b/tests/test_xdsl_base.py
index 39aa97828b..23eeaf97ce 100644
--- a/tests/test_xdsl_base.py
+++ b/tests/test_xdsl_base.py
@@ -972,6 +972,20 @@ def test_function_IV():
     assert np.isclose(norm(u), devito_norm_u)
 
 
+def test_function_V():
+    grid = Grid(shape=(5, 5))
+    x, y = grid.dimensions
+
+    f = Function(name="f", grid=grid)
+
+    eqns = [Eq(f, 2)]
+
+    op = Operator(eqns, opt="xdsl")
+    op.apply()
+
+    assert np.all(f.data == 2)
+
+
 class TestTrigonometric(object):
 
     @pytest.mark.parametrize('deg, exp', ([90.0, 3.5759869], [30.0, 3.9521265],
@@ -1028,37 +1042,20 @@ def test_tan(self, deg, exp):
         assert np.isclose(norm(u), exp, rtol=1e-4)
 
 
-class TestOperatorUnsupported(object):
+def test_forward_assignment():
+    # simple forward assignment
 
-    @pytest.mark.xfail(reason="stencil.return operation does not verify for i64")
-    def test_forward_assignment(self):
-        # simple forward assignment
-
-        grid = Grid(shape=(4, 4))
-        u = TimeFunction(name="u", grid=grid, space_order=2)
-        u.data[:, :, :] = 0
-
-        eq0 = Eq(u.forward, 1)
-
-        op = Operator([eq0], opt='xdsl')
-
-        op.apply(time_M=1)
-
-        assert np.isclose(norm(u), 5.6584, rtol=0.001)
-
-    @pytest.mark.xfail(reason="stencil.return operation does not verify for i64")
-    def test_function(self):
-        grid = Grid(shape=(5, 5))
-        x, y = grid.dimensions
+    grid = Grid(shape=(4, 4))
+    u = TimeFunction(name="u", grid=grid, space_order=2)
+    u.data[:, :, :] = 0
 
-        f = Function(name="f", grid=grid)
+    eq0 = Eq(u.forward, 1)
 
-        eqns = [Eq(f, 2)]
+    op = Operator([eq0], opt='xdsl')
 
-        op = Operator(eqns, opt='xdsl')
-        op.apply()
+    op.apply(time_M=1)
 
-        assert np.all(f.data == 4)
+    assert np.isclose(norm(u), 5.6584, rtol=0.001)
 
 
 class TestElastic():

From 74746f56f7b1e91037aff9a22b841979974b3ffa Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Wed, 21 Aug 2024 17:55:59 +0100
Subject: [PATCH 23/25] Update to PR.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index 35b24eec97..ead38f6031 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
+        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index d952fb4765..a95283c43a 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
+        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index b4bc3c34f3..1f8718fa5b 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
+        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index 60471ccdc8..f8f2e7493c 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
+        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index 9ab998511f..a4d0b6ffcf 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@c86953132b9bd52b6b9dbbd3463bde52c5c15fda
+        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
         
     - name: Test no-MPI, no-Openmp
       run: |

From 51e6f97ded417939105087eabc5a7fe63f730412 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Fri, 30 Aug 2024 14:47:02 +0100
Subject: [PATCH 24/25] GPU pipeline canonicalization

---
 devito/xdsl_core/xdsl_gpu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/devito/xdsl_core/xdsl_gpu.py b/devito/xdsl_core/xdsl_gpu.py
index 9f281015b3..c84cf14492 100644
--- a/devito/xdsl_core/xdsl_gpu.py
+++ b/devito/xdsl_core/xdsl_gpu.py
@@ -140,7 +140,10 @@ def _jit_compile(self):
 
 def generate_XDSL_GPU_PIPELINE():
     passes = [
+        "canonicalize",
+        "cse",
         "shape-inference",
+        "stencil-bufferize",
         "convert-stencil-to-ll-mlir",
         "reconcile-unrealized-casts",
         "printf-to-llvm",

From 72c909cf72923fee0b5102f3e21b29186fc72675 Mon Sep 17 00:00:00 2001
From: Emilien Bauer <bauer.emilien@gmail.com>
Date: Fri, 30 Aug 2024 15:14:47 +0100
Subject: [PATCH 25/25] Bump to `main`.

---
 .github/workflows/ci-lit.yml             | 2 +-
 .github/workflows/ci-mlir-mpi-openmp.yml | 2 +-
 .github/workflows/ci-mlir-mpi.yml        | 2 +-
 .github/workflows/ci-mlir-openmp.yml     | 2 +-
 .github/workflows/ci-mlir.yml            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-lit.yml b/.github/workflows/ci-lit.yml
index ead38f6031..75c2f350db 100644
--- a/.github/workflows/ci-lit.yml
+++ b/.github/workflows/ci-lit.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install requirements and xDSL
       run: |
-        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
+        pip install git+https://github.com/xdslproject/xdsl@48b530615ab877e980d1f6339c18e63970011311
         pip install -e .[tests]
 
     - name: Execute lit tests
diff --git a/.github/workflows/ci-mlir-mpi-openmp.yml b/.github/workflows/ci-mlir-mpi-openmp.yml
index a95283c43a..900fbbc13e 100644
--- a/.github/workflows/ci-mlir-mpi-openmp.yml
+++ b/.github/workflows/ci-mlir-mpi-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
+        pip install git+https://github.com/xdslproject/xdsl@48b530615ab877e980d1f6339c18e63970011311
 
     - name: Test with MPI + openmp
       run: |
diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index 1f8718fa5b..d699dda4ef 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
+        pip install git+https://github.com/xdslproject/xdsl@48b530615ab877e980d1f6339c18e63970011311
 
     - name: Test with MPI - no Openmp
       run: |
diff --git a/.github/workflows/ci-mlir-openmp.yml b/.github/workflows/ci-mlir-openmp.yml
index f8f2e7493c..f78144d33a 100644
--- a/.github/workflows/ci-mlir-openmp.yml
+++ b/.github/workflows/ci-mlir-openmp.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         pip install -e .[tests]
         pip install mpi4py
-        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
+        pip install git+https://github.com/xdslproject/xdsl@48b530615ab877e980d1f6339c18e63970011311
 
     - name: Test no-MPI, Openmp
       run: |
diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml
index a4d0b6ffcf..fc16c030f2 100644
--- a/.github/workflows/ci-mlir.yml
+++ b/.github/workflows/ci-mlir.yml
@@ -35,7 +35,7 @@ jobs:
     - name: Install requirements and xDSL
       run: |
         pip install -e .[tests]
-        pip install git+https://github.com/xdslproject/xdsl@c0bd8d3fb67b63950601a43b0779d2c35e29f3b7
+        pip install git+https://github.com/xdslproject/xdsl@48b530615ab877e980d1f6339c18e63970011311
         
     - name: Test no-MPI, no-Openmp
       run: |