diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7e46716cb3..67f50d2f31 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -3,8 +3,6 @@ # Please see the documentation for all configuration options: # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates -# For Devito-xdsl fork - version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values diff --git a/devito/core/__init__.py b/devito/core/__init__.py index 92b4a1c025..2fae24864f 100644 --- a/devito/core/__init__.py +++ b/devito/core/__init__.py @@ -4,7 +4,6 @@ Cpu64FsgCOperator, Cpu64FsgOmpOperator, Cpu64CustomOperator) -from devito.core.cpu_xdsl import XdslnoopOperator, XdslAdvOperator from devito.core.intel import (Intel64AdvCOperator, Intel64AdvOmpOperator, Intel64FsgCOperator, Intel64FsgOmpOperator) from devito.core.arm import ArmAdvCOperator, ArmAdvOmpOperator @@ -12,10 +11,13 @@ from devito.core.gpu import (DeviceNoopOmpOperator, DeviceNoopAccOperator, DeviceAdvOmpOperator, DeviceAdvAccOperator, DeviceFsgOmpOperator, DeviceFsgAccOperator, - DeviceCustomOmpOperator, DeviceCustomAccOperator, - XdslAdvDeviceOperator) + DeviceCustomOmpOperator, DeviceCustomAccOperator) from devito.operator.registry import operator_registry +# Import XDSL Operators +from devito.xdsl_core.xdsl_cpu import XdslnoopOperator, XdslAdvOperator +from devito.xdsl_core.xdsl_gpu import XdslAdvDeviceOperator + # Register CPU Operators operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'C') operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'openmp') diff --git a/devito/core/cpu.py b/devito/core/cpu.py index ecf6dce42f..bca6e3e13d 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -4,13 +4,12 @@ from devito.exceptions import InvalidOperator from devito.passes.equations import collect_derivatives from devito.passes.clusters import (Lift, blocking, buffering, cire, cse, - factorize, fission, fuse, optimize_pows, - optimize_hyperplanes) -from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, linearize, mpiize, - hoist_prodders, relax_incr_dimensions) + factorize, fission, fuse, optimize_hyperplanes, + optimize_pows) +from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, hoist_prodders, + linearize, mpiize, relax_incr_dimensions) from devito.tools import timed_pass - __all__ = ['Cpu64NoopCOperator', 'Cpu64NoopOmpOperator', 'Cpu64AdvCOperator', 'Cpu64AdvOmpOperator', 'Cpu64FsgCOperator', 'Cpu64FsgOmpOperator', 'Cpu64CustomOperator'] diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 1570d1ac97..6742aef69f 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -1,17 +1,9 @@ -from contextlib import redirect_stdout -import io -import os -import sys from functools import partial -from io import StringIO import numpy as np -from devito.arch.archinfo import get_nvidia_cc -from devito.core.operator import CoreOperator, CustomOperator, ParTile -from devito.core.cpu_xdsl import (XdslAdvOperator, generate_mlir_pipeline, - generate_pipeline) +from devito.core.operator import CoreOperator, CustomOperator, ParTile from devito.exceptions import InvalidOperator from devito.operator.operator import rcompile @@ -22,15 +14,9 @@ optimize_pows) from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, mpiize, hoist_prodders, linearize, pthreadify, relax_incr_dimensions) -from devito.logger import info, perf -from devito.mpi import MPI from devito.tools import as_tuple, timed_pass -from xdsl.printer import 
Printer -from xdsl.xdsl_opt_main import xDSLOptMain - -from devito.ir.ietxdsl.cluster_to_ssa import finalize_module_with_globals __all__ = ['DeviceNoopOperator', 'DeviceAdvOperator', 'DeviceCustomOperator', 'DeviceNoopOmpOperator', 'DeviceAdvOmpOperator', 'DeviceFsgOmpOperator', @@ -376,122 +362,6 @@ def _make_iet_passes_mapper(cls, **kwargs): return mapper -class XdslAdvDeviceOperator(XdslAdvOperator): - - _Target = DeviceOmpTarget - - def _jit_compile(self): - """ - JIT-compile the C code generated by the Operator. - It is ensured that JIT compilation will only be performed - once per Operator, reagardless of how many times this method - is invoked. - """ - with self._profiler.timer_on('jit-compile'): - is_mpi = MPI.Is_initialized() - is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX' - - if is_mpi and is_gpu: - raise RuntimeError("Cannot run MPI+GPU for now!") - - # specialize the code for the specific apply parameters - finalize_module_with_globals(self._module, self._jit_kernel_constants, - gpu_boilerplate=is_gpu) - - # print module as IR - module_str = StringIO() - Printer(stream=module_str).print(self._module) - module_str = module_str.getvalue() - - xdsl_pipeline = generate_XDSL_GPU_PIPELINE() - # Get GPU blocking shapes - block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])] # noqa - block_sizes = ','.join(str(bs) for bs in block_sizes) - mlir_pipeline = generate_MLIR_GPU_PIPELINE(block_sizes) - - # allow jit backdooring to provide your own xdsl code - backdoor = os.getenv('XDSL_JIT_BACKDOOR') - if backdoor is not None: - if os.path.splitext(backdoor)[1] == ".so": - info(f"JIT Backdoor: skipping compilation and using {backdoor}") - self._tf.name = backdoor - return - print("JIT Backdoor: loading xdsl file from: " + backdoor) - with open(backdoor, 'r') as f: - module_str = f.read() - - # Uncomment to print the module_str - # Printer().print(module_str) - source_name = os.path.splitext(self._tf.name)[0] + ".mlir" - source_file = open(source_name, "w") - source_file.write(module_str) - source_file.close() - - # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang - cflags = "-O3 -march=native -mtune=native -lmlir_c_runner_utils" - - try: - cc = "clang" - - cflags += " -lmlir_cuda_runtime " - cflags += " -shared " - - # TODO More detailed error handling manually, - # instead of relying on a bash-only feature. 
- - # xdsl-opt, get xDSL IR - # TODO: Remove quotes in pipeline; currently workaround with [1:-1] - # Run the first pipeline, mostly xDSL-centric - xdsl_args = [source_name, - "--allow-unregistered-dialect", - "-p", - xdsl_pipeline[1:-1],] - # We use the Python API to run xDSL rather than a subprocess - # This avoids reimport overhead - xdsl = xDSLOptMain(args=xdsl_args) - out = io.StringIO() - perf("-----------------") - perf(f"xdsl-opt {' '.join(xdsl_args)}") - with redirect_stdout(out): - xdsl.run() - - # To use as input in the next stage - out.seek(0) - # Run the second pipeline, mostly MLIR-centric - xdsl_mlir_args = ["--allow-unregistered-dialect", - "-p", - mlir_pipeline] - # We drive it though xDSL rather than a mlir-opt call for: - # - ability to use xDSL replacement passes in the middle - # - Avoiding complex process cmanagement code here: xDSL provides - xdsl = xDSLOptMain(args=xdsl_mlir_args) - out2 = io.StringIO() - perf("-----------------") - perf(f"xdsl-opt {' '.join(xdsl_mlir_args)}") - with redirect_stdout(out2): - old_stdin = sys.stdin - sys.stdin = out - xdsl.run() - sys.stdin = old_stdin - - # mlir-translate to translate to LLVM-IR - mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir' - out = self.compile(mlir_translate_cmd, out2.getvalue()) - - # Compile with clang and get LLVM-IR - clang_cmd = f'{cc} {cflags} -o {self._tf.name} {self._interop_tf.name} -xir -' # noqa - out = self.compile(clang_cmd, out) - - except Exception as ex: - print("error") - raise ex - - elapsed = self._profiler.py_timers['jit-compile'] - - perf("XDSLAdvDeviceOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" % - (self.name, source_name, elapsed)) - - # OpenACC class DeviceAccOperatorMixin(object): @@ -569,57 +439,3 @@ def reads_if_on_host(c): return set() return runs_on_host, reads_if_on_host - - -def generate_XDSL_GPU_PIPELINE(): - passes = [ - "stencil-shape-inference", - "convert-stencil-to-ll-mlir", - "reconcile-unrealized-casts", - "printf-to-llvm", - "canonicalize" - ] - - return generate_pipeline(passes) - - -# gpu-launch-sink-index-computations seemed to have no impact -def generate_MLIR_GPU_PIPELINE(block_sizes): - return generate_pipeline([ - generate_mlir_pipeline([ - "test-math-algebraic-simplification", - f"scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}}", - ]), - "gpu-map-parallel-loops", - generate_mlir_pipeline([ - "convert-parallel-loops-to-gpu", - "lower-affine", - "canonicalize", - "cse", - "fold-memref-alias-ops", - "gpu-launch-sink-index-computations", - "gpu-kernel-outlining", - "canonicalize{region-simplify}", - "cse", - "fold-memref-alias-ops", - "expand-strided-metadata", - "lower-affine", - "canonicalize", - "cse", - "func.func(gpu-async-region)", - "canonicalize", - "cse", - "convert-arith-to-llvm{index-bitwidth=64}", - "convert-scf-to-cf", - "convert-cf-to-llvm{index-bitwidth=64}", - "canonicalize", - "cse", - "convert-func-to-llvm{use-bare-ptr-memref-call-conv}", - f"nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}}", - "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)", - "gpu-to-llvm", - "gpu-module-to-binary", - "canonicalize", - "cse" - ]), - ])[1:-1] diff --git a/devito/ir/ietxdsl/__init__.py b/devito/ir/ietxdsl/__init__.py deleted file mode 100644 index dac6c2b1a8..0000000000 --- a/devito/ir/ietxdsl/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from devito.ir.ietxdsl.cluster_to_ssa import finalize_module_with_globals - -from devito.ir.ietxdsl.profiling import apply_timers - -# flake8: noqa diff --git 
a/devito/ir/xdsl_iet/__init__.py b/devito/ir/xdsl_iet/__init__.py new file mode 100644 index 0000000000..d7005bb6d1 --- /dev/null +++ b/devito/ir/xdsl_iet/__init__.py @@ -0,0 +1,5 @@ +from devito.ir.xdsl_iet.cluster_to_ssa import finalize_module_with_globals + +from devito.ir.xdsl_iet.profiling import apply_timers + +# flake8: noqa diff --git a/devito/ir/ietxdsl/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py similarity index 84% rename from devito/ir/ietxdsl/cluster_to_ssa.py rename to devito/ir/xdsl_iet/cluster_to_ssa.py index c8c1cbe26b..a28b13dd85 100644 --- a/devito/ir/ietxdsl/cluster_to_ssa.py +++ b/devito/ir/xdsl_iet/cluster_to_ssa.py @@ -1,14 +1,14 @@ from functools import reduce -import numpy as np # ------------- General imports -------------# from typing import Any, Iterable from dataclasses import dataclass, field -from sympy import Add, And, Expr, Float, GreaterThan, Indexed, Integer, LessThan, Mod, Mul, Number, Pow, StrictGreaterThan, StrictLessThan, Symbol, floor +from sympy import (Add, And, Expr, Float, GreaterThan, Indexed, Integer, LessThan, + Number, Pow, StrictGreaterThan, StrictLessThan, Symbol, floor, + Mul) from sympy.core.relational import Relational from sympy.logic.boolalg import BooleanFunction -from devito.ir.equations.equation import OpInc from devito.operations.interpolators import Injection from devito.operator.operator import Operator from devito.symbolics.search import retrieve_dimensions, retrieve_functions @@ -21,8 +21,9 @@ from devito.types.equation import Eq # ------------- xdsl imports -------------# -from xdsl.dialects import (arith, builtin, func, memref, scf, - stencil, gpu) +from xdsl.dialects import arith, func, memref, scf, stencil, gpu, builtin +from xdsl.dialects.builtin import (ModuleOp, UnrealizedConversionCastOp, StringAttr, + IndexType) from xdsl.dialects.experimental import math from xdsl.ir import Block, Operation, OpResult, Region, SSAValue from xdsl.pattern_rewriter import ( @@ -30,8 +31,7 @@ PatternRewriter, PatternRewriteWalker, RewritePattern, - op_type_rewrite_pattern, - InsertPoint + op_type_rewrite_pattern ) from xdsl.builder import ImplicitBuilder from xdsl.transforms.experimental.convert_stencil_to_ll_mlir import StencilToMemRefType @@ -40,21 +40,13 @@ from devito import Grid, SteppingDimension from devito.ir.equations import LoweredEq from devito.symbolics import retrieve_function_carriers -from devito.tools.data_structures import OrderedSet -from devito.types.dense import DiscreteFunction, Function, TimeFunction -from devito.types.equation import Eq -from devito.types.mlir_types import dtype_to_xdsltype +from devito.types.mlir_types import dtype_to_xdsltype, ptr_of, f32 # ------------- devito-xdsl SSA imports -------------# -from devito.ir.ietxdsl import iet_ssa -from devito.ir.ietxdsl.utils import is_int, is_float, dtypes_to_xdsltypes -from devito.types.mlir_types import f32, ptr_of - - -from examples.seismic.source import PointSource -from tests.test_interpolation import points -from tests.test_timestepping import d +from devito.ir.xdsl_iet import iet_ssa +from devito.ir.xdsl_iet.utils import is_int, is_float, dtypes_to_xdsltypes +from examples.seismic import PointSource # flake8: noqa @@ -91,6 +83,7 @@ def setup_memref_args(functions): return args + class ExtractDevitoStencilConversion: """ Lower Devito equations to the stencil dialect @@ -103,12 +96,6 @@ class ExtractDevitoStencilConversion: symbol_values: dict[str, SSAValue] time_offs: int - def __init__(self): - self.temps = dict() - self.symbol_values = dict() 
- - time_offs: int - def __init__(self, operator: type[Operator]): self.temps = dict() self.operator = operator @@ -144,13 +131,6 @@ def convert_symbol_eq(self, symbol: Symbol, rhs: LoweredEq, **kwargs): self.symbol_values[symbol.name] = self._visit_math_nodes(None, rhs, None) self.symbol_values[symbol.name].name_hint = symbol.name - def convert_symbol_eq(self, symbol: Symbol, rhs: LoweredEq, **kwargs): - """ - Convert a symbol equation to xDSL. - """ - self.symbol_values[symbol.name] = self._visit_math_nodes(None, rhs, None) - self.symbol_values[symbol.name].name_hint = symbol.name - def _convert_eq(self, eq: LoweredEq, **kwargs): """ # Docs here Need rewriting @@ -215,25 +195,27 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr, # If we are in a stencil (encoded by having the output_indexed passed), we # compute the relative space offsets and make it a stencil offset if output_indexed is not None: - space_offsets = [node.indices[d] - output_indexed.indices[d] for d in node.function.space_dimensions] + space_offsets = ([node.indices[d] - output_indexed.indices[d] + for d in node.function.space_dimensions]) temp = self.function_values[(node.function, time_offset)] access = stencil.AccessOp.get(temp, space_offsets) return access.res # Otherwise, generate a load op else: temp = self.function_values[(node.function, time_offset)] - memtemp = builtin.UnrealizedConversionCastOp.get(temp, StencilToMemRefType(temp.type)).results[0] + memreftype = StencilToMemRefType(temp.type) + memtemp = UnrealizedConversionCastOp.get(temp, memreftype).results[0] memtemp.name_hint = temp.name_hint + "_mem" indices = node.indices if isinstance(node.function, TimeFunction): indices = indices[1:] - ssa_indices = [self._visit_math_nodes(dim, i, output_indexed) for i in node.indices] + ssa_indices = ([self._visit_math_nodes(dim, i, output_indexed) + for i in node.indices]) for i, ssa_i in enumerate(ssa_indices): if isinstance(ssa_i.type, builtin.IntegerType): - ssa_indices[i] = arith.IndexCastOp(ssa_i, builtin.IndexType()) + ssa_indices[i] = arith.IndexCastOp(ssa_i, IndexType()) return memref.Load.get(memtemp, ssa_indices).res - import pdb; pdb.set_trace() # Handle Integers elif isinstance(node, Integer): cst = arith.Constant.from_int_and_width(int(node), builtin.i64) @@ -294,13 +276,16 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr, # Handle Mod elif isinstance(node, INT): assert len(node.args) == 1, "Expected single argument for integer cast." - return arith.FPToSIOp(self._visit_math_nodes(dim, node.args[0], output_indexed), builtin.i64).result + return arith.FPToSIOp(self._visit_math_nodes(dim, node.args[0], + output_indexed), builtin.i64).result elif isinstance(node, floor): assert len(node.args) == 1, "Expected single argument for floor." 
- return math.FloorOp(self._visit_math_nodes(dim, node.args[0], output_indexed)).result + op = self._visit_math_nodes(dim, node.args[0], output_indexed) + return math.FloorOp(op).result elif isinstance(node, And): - SSAargs = (self._visit_math_nodes(dim, arg, output_indexed) for arg in node.args) - return reduce(lambda x,y : arith.AndI(x,y).result, SSAargs) + SSAargs = (self._visit_math_nodes(dim, arg, output_indexed) + for arg in node.args) + return reduce(lambda x, y : arith.AndI(x, y).result, SSAargs) elif isinstance(node, Relational): if isinstance(node, GreaterThan): mnemonic = "sge" @@ -325,7 +310,7 @@ def _visit_math_nodes(self, dim: SteppingDimension, node: Expr, else: raise NotImplementedError(f"Unknown math:{type(node)} {node}", node) - def build_stencil_step(self, dim: SteppingDimension, eq:LoweredEq) -> None: + def build_stencil_step(self, dim: SteppingDimension, eq: LoweredEq) -> None: """ Builds the body of the step function for a given dimension and equation. @@ -345,7 +330,6 @@ def build_stencil_step(self, dim: SteppingDimension, eq:LoweredEq) -> None: elif isinstance(f.function, Function): time_offset = 0 else: - import pdb;pdb.set_trace() raise NotImplementedError(f"reading function of type {type(f.function)} not supported") read_functions.add((f.function, time_offset)) @@ -375,7 +359,7 @@ def build_stencil_step(self, dim: SteppingDimension, eq:LoweredEq) -> None: assert "temp" in apply_op.name_hint apply_arg.name_hint = apply_op.name_hint.replace("temp", "blk") - self.apply_temps = {k:v for k,v in zip(read_functions, apply.region.block.args)} + self.apply_temps = {k: v for k, v in zip(read_functions, apply.region.block.args)} # Update the function values with the new temps self.function_values |= self.apply_temps @@ -400,7 +384,7 @@ def build_generic_step_expression(self, dim: SteppingDimension, eq: LoweredEq): # Sources value = self._visit_math_nodes(dim, eq.rhs, None) temp = self.function_values[self.out_time_buffer] - memtemp = builtin.UnrealizedConversionCastOp.get([temp], [StencilToMemRefType(temp.type)]).results[0] + memtemp = UnrealizedConversionCastOp.get([temp], [StencilToMemRefType(temp.type)]).results[0] memtemp.name_hint = temp.name_hint + "_mem" indices = eq.lhs.indices if isinstance(eq.lhs.function, TimeFunction): @@ -408,13 +392,17 @@ def build_generic_step_expression(self, dim: SteppingDimension, eq: LoweredEq): ssa_indices = [self._visit_math_nodes(dim, i, None) for i in indices] for i, ssa_i in enumerate(ssa_indices): if isinstance(ssa_i.type, builtin.IntegerType): - ssa_indices[i] = arith.IndexCastOp(ssa_i, builtin.IndexType()) + ssa_indices[i] = arith.IndexCastOp(ssa_i, IndexType()) match eq.operation: case None: memref.Store.get(value, memtemp, ssa_indices) - case OpInc: - memref.AtomicRMWOp(operands=[value, memtemp, ssa_indices], result_types=[value.type], properties={"kind" : builtin.IntegerAttr(0, builtin.i64)}) + case OpInc: # noqa + # Maybe rename + attr = builtin.IntegerAttr(0, builtin.i64) + memref.AtomicRMWOp(operands=[value, memtemp, ssa_indices], + result_types=[value.type], + properties={"kind": attr}) def build_condition(self, dim: SteppingDimension, eq: BooleanFunction): return self._visit_math_nodes(dim, eq, None) @@ -436,12 +424,12 @@ def build_time_loop( ): # Bounds and step boilerpalte lb = iet_ssa.LoadSymbolic.get( - step_dim.symbolic_min._C_name, builtin.IndexType() + step_dim.symbolic_min._C_name, IndexType() ) ub = iet_ssa.LoadSymbolic.get( - step_dim.symbolic_max._C_name, builtin.IndexType() + step_dim.symbolic_max._C_name, 
IndexType()
         )
-        one = arith.Constant.from_int_and_width(1, builtin.IndexType())
+        one = arith.Constant.from_int_and_width(1, IndexType())
         # Devito iterates from time_m to time_M *inclusive*, MLIR only takes
         # exclusive upper bounds, so we increment here.
         ub = arith.Addi(ub, one)
@@ -449,7 +437,7 @@
         # Take the exact time_step from Devito
         try:
             step = arith.Constant.from_int_and_width(
-                int(step_dim.symbolic_incr), builtin.IndexType()
+                int(step_dim.symbolic_incr), IndexType()
             )
 
             step.result.name_hint = "step"
@@ -469,7 +457,7 @@
             ub,
             step,
             iter_args,
-            Block(arg_types=[builtin.IndexType(), *(a.type for a in iter_args)]),
+            Block(arg_types=[IndexType(), *(a.type for a in iter_args)]),
         )
 
         # Name the 'time' step iterator
@@ -526,30 +514,45 @@ def _lower_injection(self, eqs: list[LoweredEq]):
         for interval in ispace[1:]:
             lower = interval.symbolic_min
             if isinstance(lower, Scalar):
-                lb = iet_ssa.LoadSymbolic.get(lower._C_name, builtin.IndexType())
+                lb = iet_ssa.LoadSymbolic.get(lower._C_name, IndexType())
             elif isinstance(lower, (Number, int)):
-                lb = arith.Constant.from_int_and_width(int(lower), builtin.IndexType())
+                lb = arith.Constant.from_int_and_width(int(lower), IndexType())
             else:
                 raise NotImplementedError(f"Lower bound of type {type(lower)} not supported")
-            lb.result.name_hint = f"{interval.dim.name}_m"
+
+            try:
+                name = interval.dim.symbolic_min.name
+            except AttributeError:
+                assert interval.dim.symbolic_min.is_integer
+                name = f"{interval.dim.name}_m"
+
+            lb.result.name_hint = name
 
             upper = interval.symbolic_max
             if isinstance(upper, Scalar):
-                ub = iet_ssa.LoadSymbolic.get(upper._C_name, builtin.IndexType())
+                ub = iet_ssa.LoadSymbolic.get(upper._C_name, IndexType())
             elif isinstance(upper, (Number, int)):
-                ub = arith.Constant.from_int_and_width(int(upper), builtin.IndexType())
+                ub = arith.Constant.from_int_and_width(int(upper), IndexType())
             else:
                 raise NotImplementedError(
                     f"Upper bound of type {type(upper)} not supported"
                 )
-            ub.result.name_hint = f"{interval.dim.name}_M"
+
+            try:
+                name = interval.dim.symbolic_max.name
+            except AttributeError:
+                assert interval.dim.symbolic_max.is_integer
+                name = f"{interval.dim.name}_M"
+
+            ub.result.name_hint = name
+
             lbs.append(lb)
             ubs.append(ub)
 
-        steps = [arith.Constant.from_int_and_width(1, builtin.IndexType()).result]*len(ubs)
+        steps = [arith.Constant.from_int_and_width(1, IndexType()).result]*len(ubs)
         ubs = [arith.Addi(ub, steps[0]) for ub in ubs]
 
-        with ImplicitBuilder(scf.ParallelOp(lbs, ubs, steps, [pblock := Block(arg_types=[builtin.IndexType()]*len(ubs))]).body):
+        with ImplicitBuilder(scf.ParallelOp(lbs, ubs, steps, [pblock := Block(arg_types=[IndexType()]*len(ubs))]).body):
             for arg, interval in zip(pblock.args, ispace[1:], strict=True):
                 arg.name_hint = interval.dim.name
                 self.symbol_values[interval.dim.name] = arg
@@ -558,7 +561,7 @@
             scf.Yield()
         # raise NotImplementedError("Injections not supported yet")
 
-    def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp:
+    def convert(self, eqs: Iterable[Eq], **kwargs) -> ModuleOp:
         """
         This converts a Devito Operator, represented here by a list of
         LoweredEqs, to an xDSL module defining a function implementing it.
@@ -575,7 +578,8 @@
         their time sizes. Their sizes are deduced from the Grid.
         2. Create a time iteration loop, swapping buffers to implement time
         buffering.
-        NB: This needs to be converted to a Cluster conversion soon, which will be more sound.
+ NB: This needs to be converted to a Cluster conversion soon, + which will be more sound. ```mlir func.func @apply_kernel(%u_vec_0 : !stencil.field<[-1,4]xf32>, %u_vec_1 : !stencil.field<[-1,4]xf32>) { @@ -598,10 +602,12 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: Those represents runtime values not yet known that will be JIT-compiled when calling the operator. """ + # Instantiate the module. - self.function_values : dict[tuple[Function, int], SSAValue] = {} - self.symbol_values : dict[str, SSAValue] = {} - module = builtin.ModuleOp(Region([block := Block([])])) + self.function_values: dict[tuple[Function, int], SSAValue] = {} + self.symbol_values: dict[str, SSAValue] = {} + + module = ModuleOp(Region([block := Block([])])) with ImplicitBuilder(block): # Get all functions used in the equations functions = OrderedSet() @@ -624,8 +630,8 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: else: raise NotImplementedError(f"Expression {eq} of type {type(eq)} not supported") - self.time_buffers : list[TimeFunction] = [] - self.functions : list[Function] = [] + self.time_buffers: list[TimeFunction] = [] + self.functions: list[Function] = [] for f in functions: match f: case TimeFunction(): @@ -667,12 +673,13 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: with ImplicitBuilder(xdsl_func.body.block): # Get the stepping dimension, if there is any in the whole input - time_functions = [f for (f,_) in self.time_buffers] + time_functions = [f for (f, _) in self.time_buffers] dimensions = { d for f in (self.functions + time_functions) for d in f.dimensions } - step_dim = next((d for d in dimensions if isinstance(d, SteppingDimension)), None) + step_dim = next((d for d in dimensions if + isinstance(d, SteppingDimension)), None) if step_dim is not None: self.build_time_loop(eqs, step_dim, **kwargs) else: @@ -686,7 +693,7 @@ def convert(self, eqs: Iterable[Eq], **kwargs) -> builtin.ModuleOp: def _ensure_same_type(self, *vals: SSAValue): if all(isinstance(val.type, builtin.IntegerAttr) for val in vals): return vals - if all(isinstance(val.type, builtin.IndexType) for val in vals): + if all(isinstance(val.type, IndexType) for val in vals): # Sources return vals if all(is_float(val) for val in vals): @@ -700,7 +707,7 @@ def _ensure_same_type(self, *vals: SSAValue): if cast_to_floats and is_float(val): processed.append(val) continue - if (not cast_to_floats) and isinstance(val.type, builtin.IndexType): + if (not cast_to_floats) and isinstance(val.type, IndexType): processed.append(val) continue # if the val is the result of a arith.constant with no uses, @@ -716,17 +723,17 @@ def _ensure_same_type(self, *vals: SSAValue): float(val.op.value.value.data), builtin.f32 ) else: - val.type = builtin.IndexType() - val.op.value.type = builtin.IndexType() + val.type = IndexType() + val.op.value.type = IndexType() processed.append(val) continue # insert a cast op if cast_to_floats: - if val.type == builtin.IndexType(): + if val.type == IndexType(): val = arith.IndexCastOp(val, builtin.i64).result conv = arith.SIToFPOp(val, builtin.f32) else: - conv = arith.IndexCastOp(val, builtin.IndexType()) + conv = arith.IndexCastOp(val, IndexType()) processed.append(conv.result) return processed @@ -754,7 +761,7 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): return self.done = True - op.sym_name = builtin.StringAttr("gpu_kernel") + op.sym_name = StringAttr("gpu_kernel") print("Doing GPU STUFF") # GPU STUFF wrapper = 
func.FuncOp(self.func_name, op.function_type, Region(Block([func.Return()], arg_types=op.function_type.inputs))) @@ -762,12 +769,12 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): wrapper.body.block.insert_op_before(func.Call("gpu_kernel", body.args, []), body.last_op) for arg in wrapper.args: shapetype = arg.type - if isinstance(shapetype, stencil.FieldType): + if isinstance(shapetype, stencil.FieldType): memref_type = memref.MemRefType.from_element_type_and_shape(shapetype.get_element_type(), shapetype.get_shape()) alloc = gpu.AllocOp(memref.MemRefType.from_element_type_and_shape(shapetype.get_element_type(), shapetype.get_shape())) - outcast = builtin.UnrealizedConversionCastOp.get(alloc, shapetype) + outcast = UnrealizedConversionCastOp.get(alloc, shapetype) arg.replace_by(outcast.results[0]) - incast = builtin.UnrealizedConversionCastOp.get(arg, memref_type) + incast = UnrealizedConversionCastOp.get(arg, memref_type) copy = gpu.MemcpyOp(source=incast, destination=alloc) body.insert_ops_before([alloc, outcast, incast, copy], body.ops.first) @@ -777,49 +784,6 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): rewriter.insert_op_after_matched_op(wrapper) -class TimerRewritePattern(RewritePattern): - """ - Base class for time benchmarking related rewrite patterns - """ - pass - - -@dataclass -class MakeFunctionTimed(TimerRewritePattern): - """ - Populate the section0 devito timer with the total runtime of the function - """ - func_name: str - seen_ops: set[func.Func] = field(default_factory=set) - - @op_type_rewrite_pattern - def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): - if op.sym_name.data != self.func_name or op in self.seen_ops: - return - - # only apply once - self.seen_ops.add(op) - - # Insert timer start and end calls - rewriter.insert_op([ - t0 := func.Call('timer_start', [], [builtin.f64]) - ], InsertPoint.at_start(op.body.block)) - - ret = op.get_return_op() - assert ret is not None - - rewriter.insert_op_before([ - timers := iet_ssa.LoadSymbolic.get('timers', llvm.LLVMPointerType.opaque()), - t1 := func.Call('timer_end', [t0], [builtin.f64]), - llvm.StoreOp(t1, timers), - ], ret) - - rewriter.insert_op([ - func.FuncOp.external('timer_start', [], [builtin.f64]), - func.FuncOp.external('timer_end', [builtin.f64], [builtin.f64]), - ], InsertPoint.after(rewriter.current_operation)) - - def get_containing_func(op: Operation) -> func.FuncOp | None: while op is not None and not isinstance(op, func.FuncOp): op = op.parent_op() @@ -887,7 +851,7 @@ def match_and_rewrite(self, op: iet_ssa.LoadSymbolic, rewriter: PatternRewriter, parent.update_function_type() -def finalize_module_with_globals(module: builtin.ModuleOp, known_symbols: dict[str, Any], +def finalize_module_with_globals(module: ModuleOp, known_symbols: dict[str, Any], gpu_boilerplate): """ This function finalizes a module by replacing all symbolic constants with their diff --git a/devito/ir/ietxdsl/iet_ssa.py b/devito/ir/xdsl_iet/iet_ssa.py similarity index 100% rename from devito/ir/ietxdsl/iet_ssa.py rename to devito/ir/xdsl_iet/iet_ssa.py diff --git a/devito/ir/ietxdsl/profiling.py b/devito/ir/xdsl_iet/profiling.py similarity index 98% rename from devito/ir/ietxdsl/profiling.py rename to devito/ir/xdsl_iet/profiling.py index f767a34ff1..1b434d78e0 100644 --- a/devito/ir/ietxdsl/profiling.py +++ b/devito/ir/xdsl_iet/profiling.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field -from devito.ir.ietxdsl import iet_ssa +from 
devito.ir.xdsl_iet import iet_ssa
 
 from xdsl.dialects import builtin, func, llvm
 from xdsl.pattern_rewriter import (RewritePattern, op_type_rewrite_pattern,
diff --git a/devito/ir/ietxdsl/utils.py b/devito/ir/xdsl_iet/utils.py
similarity index 100%
rename from devito/ir/ietxdsl/utils.py
rename to devito/ir/xdsl_iet/utils.py
diff --git a/devito/operator/operator.py b/devito/operator/operator.py
index 1c828a88a0..0c66f74ed8 100644
--- a/devito/operator/operator.py
+++ b/devito/operator/operator.py
@@ -271,6 +271,7 @@ def _lower(cls, expressions, **kwargs):
     def _rcompile_wrapper(cls, **kwargs0):
         def wrapper(expressions, **kwargs1):
             return rcompile(expressions, {**kwargs0, **kwargs1})
+
         return wrapper
 
     @classmethod
@@ -837,7 +838,6 @@ def apply(self, **kwargs):
         # Invoke kernel function with args
         arg_values = [args[p.name] for p in self.parameters]
-
         try:
             cfunction = self.cfunction
             with self._profiler.timer_on('apply', comm=args.comm):
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 5f66ed1c99..102bfe3a75 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -1366,7 +1366,6 @@ def free_symbols(self):
 
 
 class IndexedData(IndexedBase):
-    pass
 
 
diff --git a/devito/xdsl_core/__init__.py b/devito/xdsl_core/__init__.py
new file mode 100644
index 0000000000..3f0f0bffc8
--- /dev/null
+++ b/devito/xdsl_core/__init__.py
@@ -0,0 +1,4 @@
+from .xdsl_cpu import *
+from .xdsl_gpu import *
+
+# flake8: noqa
\ No newline at end of file
diff --git a/devito/xdsl_core/utils.py b/devito/xdsl_core/utils.py
new file mode 100644
index 0000000000..218e4df827
--- /dev/null
+++ b/devito/xdsl_core/utils.py
@@ -0,0 +1,13 @@
+from typing import Iterable
+
+
+def generate_pipeline(passes: Iterable[str]):
+    """Generate a pipeline string from a list of passes."""
+    passes_string = ",".join(passes)
+    return f'"{passes_string}"'
+
+
+def generate_mlir_pipeline(passes: Iterable[str]):
+    """Wrap a list of MLIR passes for delegation to mlir-opt."""
+    passes_string = ",".join(passes)
+    return f'mlir-opt[{passes_string}]'
diff --git a/devito/core/cpu_xdsl.py b/devito/xdsl_core/xdsl_cpu.py
similarity index 97%
rename from devito/core/cpu_xdsl.py
rename to devito/xdsl_core/xdsl_cpu.py
index 6c4414a63a..24e5d38ce5 100644
--- a/devito/core/cpu_xdsl.py
+++ b/devito/xdsl_core/xdsl_cpu.py
@@ -9,8 +9,6 @@
 
 from io import StringIO
 
-from typing import Iterable
-
 from devito.core.operator import CoreOperator
 from devito.ir.iet import Callable, MetaCall
 from devito.ir.iet.nodes import Section
@@ -23,13 +21,14 @@
 from xdsl.printer import Printer
 from xdsl.xdsl_opt_main import xDSLOptMain
 
-from devito.ir.ietxdsl.cluster_to_ssa import (ExtractDevitoStencilConversion,
-                                              finalize_module_with_globals,
-                                              setup_memref_args)  # noqa
+from devito.ir.xdsl_iet.cluster_to_ssa import (ExtractDevitoStencilConversion,
+                                               finalize_module_with_globals,
+                                               setup_memref_args)  # noqa
 
-from devito.ir.ietxdsl.profiling import apply_timers
+from devito.ir.xdsl_iet.profiling import apply_timers
 from devito.passes.iet import CTarget, OmpTarget
 from devito.core.cpu import Cpu64OperatorMixin
+from devito.xdsl_core.utils import generate_pipeline, generate_mlir_pipeline
 
 
 __all__ = ['XdslnoopOperator', 'XdslAdvOperator']
@@ -638,17 +637,6 @@ def generate_XDSL_MPI_PIPELINE(decomp, nb_tiled_dims):
     return generate_pipeline(passes)
 
 
-def generate_pipeline(passes: Iterable[str]):
-    'Generate a pipeline string from a list of passes'
-    passes_string = ",".join(passes)
-    return f'"{passes_string}"'
-
-
-def generate_mlir_pipeline(passes: Iterable[str]):
-    passes_string = ",".join(passes)
-    return f'mlir-opt[{passes_string}]'
-
-
 # small interop shim script for stuff that we don't want to implement in mlir-ir
 _INTEROP_C = """
 #include
diff --git a/devito/xdsl_core/xdsl_gpu.py b/devito/xdsl_core/xdsl_gpu.py
new file mode 100644
index 0000000000..87a1bc9b8a
--- /dev/null
+++ b/devito/xdsl_core/xdsl_gpu.py
@@ -0,0 +1,192 @@
+from contextlib import redirect_stdout
+import io
+import os
+import sys
+from io import StringIO
+
+from devito.arch.archinfo import get_nvidia_cc
+
+from devito.xdsl_core.xdsl_cpu import XdslAdvOperator
+
+from devito.ir.xdsl_iet.cluster_to_ssa import finalize_module_with_globals
+from devito.mpi import MPI
+
+from devito.logger import info, perf
+
+from xdsl.printer import Printer
+from xdsl.xdsl_opt_main import xDSLOptMain
+from devito.passes.iet import DeviceOmpTarget
+from devito.xdsl_core.utils import generate_pipeline, generate_mlir_pipeline
+
+
+__all__ = ['XdslAdvDeviceOperator']
+
+
+class XdslAdvDeviceOperator(XdslAdvOperator):
+
+    _Target = DeviceOmpTarget
+
+    def _jit_compile(self):
+        """
+        JIT-compile the C code generated by the Operator.
+        It is ensured that JIT compilation will only be performed
+        once per Operator, regardless of how many times this method
+        is invoked.
+        """
+        with self._profiler.timer_on('jit-compile'):
+            is_mpi = MPI.Is_initialized()
+            is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX'
+
+            if is_mpi and is_gpu:
+                raise RuntimeError("Cannot run MPI+GPU for now!")
+
+            # specialize the code for the specific apply parameters
+            finalize_module_with_globals(self._module, self._jit_kernel_constants,
+                                         gpu_boilerplate=is_gpu)
+
+            # print module as IR
+            module_str = StringIO()
+            Printer(stream=module_str).print(self._module)
+            module_str = module_str.getvalue()
+
+            xdsl_pipeline = generate_XDSL_GPU_PIPELINE()
+            # Get GPU blocking shapes
+            block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])]  # noqa
+            block_sizes = ','.join(str(bs) for bs in block_sizes)
+            mlir_pipeline = generate_MLIR_GPU_PIPELINE(block_sizes)
+
+            # allow jit backdooring to provide your own xdsl code
+            backdoor = os.getenv('XDSL_JIT_BACKDOOR')
+            if backdoor is not None:
+                if os.path.splitext(backdoor)[1] == ".so":
+                    info(f"JIT Backdoor: skipping compilation and using {backdoor}")
+                    self._tf.name = backdoor
+                    return
+                print("JIT Backdoor: loading xdsl file from: " + backdoor)
+                with open(backdoor, 'r') as f:
+                    module_str = f.read()
+
+            # Uncomment to print the module_str
+            # Printer().print(module_str)
+            source_name = os.path.splitext(self._tf.name)[0] + ".mlir"
+            source_file = open(source_name, "w")
+            source_file.write(module_str)
+            source_file.close()
+
+            # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang
+            cflags = "-O3 -march=native -mtune=native -lmlir_c_runner_utils"
+
+            try:
+                cc = "clang"
+
+                cflags += " -lmlir_cuda_runtime "
+                cflags += " -shared "
+
+                # TODO More detailed error handling manually,
+                # instead of relying on a bash-only feature.
+
+                # xdsl-opt, get xDSL IR
+                # TODO: Remove quotes in pipeline; currently workaround with [1:-1]
+                # Run the first pipeline, mostly xDSL-centric
+                xdsl_args = [source_name,
+                             "--allow-unregistered-dialect",
+                             "-p",
+                             xdsl_pipeline[1:-1],]
+                # We use the Python API to run xDSL rather than a subprocess
+                # This avoids reimport overhead
+                xdsl = xDSLOptMain(args=xdsl_args)
+                out = io.StringIO()
+                perf("-----------------")
+                perf(f"xdsl-opt {' '.join(xdsl_args)}")
+                with redirect_stdout(out):
+                    xdsl.run()
+
+                # To use as input in the next stage
+                out.seek(0)
+                # Run the second pipeline, mostly MLIR-centric
+                xdsl_mlir_args = ["--allow-unregistered-dialect",
+                                  "-p",
+                                  mlir_pipeline]
+                # We drive it through xDSL rather than an mlir-opt call for:
+                # - ability to use xDSL replacement passes in the middle
+                # - Avoiding complex process management code here: xDSL provides it
+                xdsl = xDSLOptMain(args=xdsl_mlir_args)
+                out2 = io.StringIO()
+                perf("-----------------")
+                perf(f"xdsl-opt {' '.join(xdsl_mlir_args)}")
+                with redirect_stdout(out2):
+                    old_stdin = sys.stdin
+                    sys.stdin = out
+                    xdsl.run()
+                    sys.stdin = old_stdin
+
+                # mlir-translate to translate to LLVM-IR
+                mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir'
+                out = self.compile(mlir_translate_cmd, out2.getvalue())
+
+                # Compile the LLVM-IR with clang into a shared object
+                clang_cmd = f'{cc} {cflags} -o {self._tf.name} {self._interop_tf.name} -xir -'  # noqa
+                out = self.compile(clang_cmd, out)
+
+            except Exception as ex:
+                print("error")
+                raise ex
+
+        elapsed = self._profiler.py_timers['jit-compile']
+
+        perf("XDSLAdvDeviceOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" %
+             (self.name, source_name, elapsed))
+
+
+def generate_XDSL_GPU_PIPELINE():
+    passes = [
+        "stencil-shape-inference",
+        "convert-stencil-to-ll-mlir",
+        "reconcile-unrealized-casts",
+        "printf-to-llvm",
+        "canonicalize"
+    ]
+
+    return generate_pipeline(passes)
+
+
+# gpu-launch-sink-index-computations seemed to have no impact
+def generate_MLIR_GPU_PIPELINE(block_sizes):
+    return generate_pipeline([
+        generate_mlir_pipeline([
+            "test-math-algebraic-simplification",
+            f"scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}}",
+        ]),
+        "gpu-map-parallel-loops",
+        generate_mlir_pipeline([
+            "convert-parallel-loops-to-gpu",
+            "lower-affine",
+            "canonicalize",
+            "cse",
+            "fold-memref-alias-ops",
+            "gpu-launch-sink-index-computations",
+            "gpu-kernel-outlining",
+            "canonicalize{region-simplify}",
+            "cse",
+            "fold-memref-alias-ops",
+            "expand-strided-metadata",
+            "lower-affine",
+            "canonicalize",
+            "cse",
+            "func.func(gpu-async-region)",
+            "canonicalize",
+            "cse",
+            "convert-arith-to-llvm{index-bitwidth=64}",
+            "convert-scf-to-cf",
+            "convert-cf-to-llvm{index-bitwidth=64}",
+            "canonicalize",
+            "cse",
+            "convert-func-to-llvm{use-bare-ptr-memref-call-conv}",
+            f"nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}}",
+            "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)",
+            "gpu-to-llvm",
+            "gpu-module-to-binary",
+            "canonicalize",
+            "cse"
+        ]),
+    ])[1:-1]
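
For reference, the pipeline strings built by generate_XDSL_GPU_PIPELINE and generate_MLIR_GPU_PIPELINE compose via the two helpers this patch moves into devito/xdsl_core/utils.py. A minimal, self-contained sketch — helper bodies copied from the patch, the pass names purely illustrative:

from typing import Iterable


def generate_pipeline(passes: Iterable[str]):
    # As in devito/xdsl_core/utils.py: join the passes and wrap in double quotes
    passes_string = ",".join(passes)
    return f'"{passes_string}"'


def generate_mlir_pipeline(passes: Iterable[str]):
    # Wrap a group of MLIR passes so xDSL delegates them to mlir-opt
    passes_string = ",".join(passes)
    return f'mlir-opt[{passes_string}]'


# Compose a nested pipeline, mirroring the shape of generate_MLIR_GPU_PIPELINE
pipeline = generate_pipeline([
    generate_mlir_pipeline(["canonicalize", "cse"]),
    "gpu-map-parallel-loops",
])
print(pipeline)        # "mlir-opt[canonicalize,cse],gpu-map-parallel-loops"
print(pipeline[1:-1])  # mlir-opt[canonicalize,cse],gpu-map-parallel-loops

The [1:-1] slices in _jit_compile and at the end of generate_MLIR_GPU_PIPELINE exist solely to strip those outer double quotes before the string reaches xDSLOptMain's -p argument, which is what the "Remove quotes in pipeline" TODO above refers to.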
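
Likewise, the GPU tile sizes fed to scf-parallel-loop-tiling come from clamping the per-dimension targets (32, 4, 8) by the grid extents known at JIT time. A small sketch of that one-liner from _jit_compile; the sizes dict is a hypothetical stand-in for self._jit_kernel_constants:

# Hypothetical stand-in for the JIT-time kernel constants
sizes = {"x_size": 256, "y_size": 256, "z_size": 2}

# Same expression as in _jit_compile: clamp each target by the grid extent,
# defaulting to 1 for a missing dimension
block_sizes = [min(target, sizes.get(f"{dim}_size", 1))
               for target, dim in zip([32, 4, 8], ["x", "y", "z"])]
print(','.join(str(bs) for bs in block_sizes))  # -> 32,4,2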