Skip to content

Commit

Permalink
Merge branch 'master' into dependabot/pip/distributed-lt-2024.7
Browse files Browse the repository at this point in the history
  • Loading branch information
georgebisbas authored Jun 19, 2024
2 parents b3393fc + a0d100a commit 7a9354d
Show file tree
Hide file tree
Showing 16 changed files with 319 additions and 344 deletions.
2 changes: 0 additions & 2 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
# Please see the documentation for all configuration options:
# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates

# For Devito-xdsl fork

version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
Expand Down
8 changes: 5 additions & 3 deletions devito/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@
Cpu64FsgCOperator, Cpu64FsgOmpOperator,
Cpu64CustomOperator)

from devito.core.cpu_xdsl import XdslnoopOperator, XdslAdvOperator
from devito.core.intel import (Intel64AdvCOperator, Intel64AdvOmpOperator,
Intel64FsgCOperator, Intel64FsgOmpOperator)
from devito.core.arm import ArmAdvCOperator, ArmAdvOmpOperator
from devito.core.power import PowerAdvCOperator, PowerAdvOmpOperator
from devito.core.gpu import (DeviceNoopOmpOperator, DeviceNoopAccOperator,
DeviceAdvOmpOperator, DeviceAdvAccOperator,
DeviceFsgOmpOperator, DeviceFsgAccOperator,
DeviceCustomOmpOperator, DeviceCustomAccOperator,
XdslAdvDeviceOperator)
DeviceCustomOmpOperator, DeviceCustomAccOperator)
from devito.operator.registry import operator_registry

# Import XDSL Operators
from devito.xdsl_core.xdsl_cpu import XdslnoopOperator, XdslAdvOperator
from devito.xdsl_core.xdsl_gpu import XdslAdvDeviceOperator

# Register CPU Operators
operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'C')
operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'openmp')
Expand Down
9 changes: 4 additions & 5 deletions devito/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
from devito.exceptions import InvalidOperator
from devito.passes.equations import collect_derivatives
from devito.passes.clusters import (Lift, blocking, buffering, cire, cse,
factorize, fission, fuse, optimize_pows,
optimize_hyperplanes)
from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, linearize, mpiize,
hoist_prodders, relax_incr_dimensions)
factorize, fission, fuse, optimize_hyperplanes,
optimize_pows)
from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, hoist_prodders,
linearize, mpiize, relax_incr_dimensions)
from devito.tools import timed_pass


__all__ = ['Cpu64NoopCOperator', 'Cpu64NoopOmpOperator', 'Cpu64AdvCOperator',
'Cpu64AdvOmpOperator', 'Cpu64FsgCOperator', 'Cpu64FsgOmpOperator',
'Cpu64CustomOperator']
Expand Down
186 changes: 1 addition & 185 deletions devito/core/gpu.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
from contextlib import redirect_stdout
import io
import os
import sys
from functools import partial
from io import StringIO

import numpy as np
from devito.arch.archinfo import get_nvidia_cc

from devito.core.operator import CoreOperator, CustomOperator, ParTile

from devito.core.cpu_xdsl import (XdslAdvOperator, generate_mlir_pipeline,
generate_pipeline)
from devito.core.operator import CoreOperator, CustomOperator, ParTile

from devito.exceptions import InvalidOperator
from devito.operator.operator import rcompile
Expand All @@ -22,15 +14,9 @@
optimize_pows)
from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, mpiize, hoist_prodders,
linearize, pthreadify, relax_incr_dimensions)
from devito.logger import info, perf
from devito.mpi import MPI

from devito.tools import as_tuple, timed_pass

from xdsl.printer import Printer
from xdsl.xdsl_opt_main import xDSLOptMain

from devito.ir.ietxdsl.cluster_to_ssa import finalize_module_with_globals

__all__ = ['DeviceNoopOperator', 'DeviceAdvOperator', 'DeviceCustomOperator',
'DeviceNoopOmpOperator', 'DeviceAdvOmpOperator', 'DeviceFsgOmpOperator',
Expand Down Expand Up @@ -376,122 +362,6 @@ def _make_iet_passes_mapper(cls, **kwargs):
return mapper


class XdslAdvDeviceOperator(XdslAdvOperator):
    """xDSL-based advanced Operator targeting devices (``DeviceOmpTarget``)."""

    _Target = DeviceOmpTarget

    def _jit_compile(self):
        """
        JIT-compile the xDSL/MLIR module held by the Operator.

        The module is specialized with the JIT kernel constants, printed to a
        ``.mlir`` file next to ``self._tf.name``, lowered through two xDSL-driven
        pipelines (an xDSL-centric one, then an MLIR-centric one), translated
        to LLVM-IR with ``mlir-translate`` and finally compiled with ``clang``
        into ``self._tf.name``.

        If the ``XDSL_JIT_BACKDOOR`` environment variable is set, it either
        points to a prebuilt ``.so`` (compilation is skipped and the shared
        object is used as is) or to a file whose content replaces the
        generated module text.

        Raises
        ------
        RuntimeError
            If MPI is initialized while ``DEVITO_PLATFORM`` is ``nvidiaX``.

        NOTE(review): the previous docstring stated that compilation is
        performed only once per Operator, regardless of how many times this
        method is invoked; nothing in this method enforces that -- confirm
        against the caller.
        """
        with self._profiler.timer_on('jit-compile'):
            is_mpi = MPI.Is_initialized()
            is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX'

            if is_mpi and is_gpu:
                raise RuntimeError("Cannot run MPI+GPU for now!")

            # Specialize the code for the specific apply parameters
            finalize_module_with_globals(self._module, self._jit_kernel_constants,
                                         gpu_boilerplate=is_gpu)

            # Print module as IR
            module_stream = StringIO()
            Printer(stream=module_stream).print(self._module)
            module_str = module_stream.getvalue()

            xdsl_pipeline = generate_XDSL_GPU_PIPELINE()

            # Get GPU blocking shapes: each target size is capped by the
            # matching `<dim>_size` kernel constant (1 when it is missing)
            block_shape = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1))
                           for target, dim in zip([32, 4, 8], ["x", "y", "z"])]
            block_sizes = ','.join(str(bs) for bs in block_shape)
            mlir_pipeline = generate_MLIR_GPU_PIPELINE(block_sizes)

            # Allow JIT backdooring to provide your own xdsl code
            backdoor = os.getenv('XDSL_JIT_BACKDOOR')
            if backdoor is not None:
                if os.path.splitext(backdoor)[1] == ".so":
                    info(f"JIT Backdoor: skipping compilation and using {backdoor}")
                    self._tf.name = backdoor
                    return
                print("JIT Backdoor: loading xdsl file from: " + backdoor)
                with open(backdoor, 'r') as f:
                    module_str = f.read()

            # Dump the IR next to the target file; the context manager makes
            # sure the handle is closed even if the write fails
            source_name = os.path.splitext(self._tf.name)[0] + ".mlir"
            with open(source_name, "w") as source_file:
                source_file.write(module_str)

            # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang
            cflags = "-O3 -march=native -mtune=native -lmlir_c_runner_utils"

            try:
                cc = "clang"

                cflags += " -lmlir_cuda_runtime "
                cflags += " -shared "

                # TODO More detailed error handling manually,
                # instead of relying on a bash-only feature.

                # xdsl-opt, get xDSL IR
                # TODO: Remove quotes in pipeline; currently workaround with [1:-1]
                # Run the first pipeline, mostly xDSL-centric
                xdsl_args = [source_name,
                             "--allow-unregistered-dialect",
                             "-p",
                             xdsl_pipeline[1:-1]]
                # We use the Python API to run xDSL rather than a subprocess
                # This avoids reimport overhead
                xdsl = xDSLOptMain(args=xdsl_args)
                out = io.StringIO()
                perf("-----------------")
                perf(f"xdsl-opt {' '.join(xdsl_args)}")
                with redirect_stdout(out):
                    xdsl.run()

                # To use as input in the next stage
                out.seek(0)
                # Run the second pipeline, mostly MLIR-centric
                xdsl_mlir_args = ["--allow-unregistered-dialect",
                                  "-p",
                                  mlir_pipeline]
                # We drive it through xDSL rather than a mlir-opt call for:
                # - ability to use xDSL replacement passes in the middle
                # - avoiding complex process management code here
                xdsl = xDSLOptMain(args=xdsl_mlir_args)
                out2 = io.StringIO()
                perf("-----------------")
                perf(f"xdsl-opt {' '.join(xdsl_mlir_args)}")
                with redirect_stdout(out2):
                    # Feed the first stage's output through stdin; always put
                    # the real stdin back, even when the run fails
                    old_stdin = sys.stdin
                    sys.stdin = out
                    try:
                        xdsl.run()
                    finally:
                        sys.stdin = old_stdin

                # mlir-translate to translate to LLVM-IR
                mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir'
                out = self.compile(mlir_translate_cmd, out2.getvalue())

                # Compile with clang and get LLVM-IR
                clang_cmd = (f'{cc} {cflags} -o {self._tf.name} '
                             f'{self._interop_tf.name} -xir -')
                out = self.compile(clang_cmd, out)

            except Exception:
                print("error")
                raise

        elapsed = self._profiler.py_timers['jit-compile']

        perf("XDSLAdvDeviceOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" %
             (self.name, source_name, elapsed))


# OpenACC

class DeviceAccOperatorMixin(object):
Expand Down Expand Up @@ -569,57 +439,3 @@ def reads_if_on_host(c):
return set()

return runs_on_host, reads_if_on_host


def generate_XDSL_GPU_PIPELINE():
    """
    Build the xDSL-side pass pipeline used for GPU code generation.

    The pass names below are handed, in this order, to ``generate_pipeline``
    and whatever it produces is returned unchanged.
    """
    return generate_pipeline([
        "stencil-shape-inference",
        "convert-stencil-to-ll-mlir",
        "reconcile-unrealized-casts",
        "printf-to-llvm",
        "canonicalize",
    ])


# gpu-launch-sink-index-computations seemed to have no impact
def generate_MLIR_GPU_PIPELINE(block_sizes):
    """
    Build the MLIR-side pass pipeline used for GPU code generation.

    Parameters
    ----------
    block_sizes : str
        Value interpolated into the ``parallel-loop-tile-sizes`` option of
        ``scf-parallel-loop-tiling`` (the caller passes comma-separated sizes).

    Returns
    -------
    The result of ``generate_pipeline`` with its first and last elements
    dropped (``[1:-1]``).
    """
    # Stage 1: simplification and tiling of the parallel loops
    tiling_stage = generate_mlir_pipeline([
        "test-math-algebraic-simplification",
        f"scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}}",
    ])

    # The NVVM target chip is derived from the detected compute capability
    compute_capability = get_nvidia_cc()

    # Stage 2: map the loops to GPU kernels and lower everything to LLVM
    lowering_stage = generate_mlir_pipeline([
        "convert-parallel-loops-to-gpu",
        "lower-affine",
        "canonicalize",
        "cse",
        "fold-memref-alias-ops",
        "gpu-launch-sink-index-computations",
        "gpu-kernel-outlining",
        "canonicalize{region-simplify}",
        "cse",
        "fold-memref-alias-ops",
        "expand-strided-metadata",
        "lower-affine",
        "canonicalize",
        "cse",
        "func.func(gpu-async-region)",
        "canonicalize",
        "cse",
        "convert-arith-to-llvm{index-bitwidth=64}",
        "convert-scf-to-cf",
        "convert-cf-to-llvm{index-bitwidth=64}",
        "canonicalize",
        "cse",
        "convert-func-to-llvm{use-bare-ptr-memref-call-conv}",
        f"nvvm-attach-target{{O=3 ftz fast chip=sm_{compute_capability}}}",
        "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)",
        "gpu-to-llvm",
        "gpu-module-to-binary",
        "canonicalize",
        "cse",
    ])

    full_pipeline = generate_pipeline([
        tiling_stage,
        "gpu-map-parallel-loops",
        lowering_stage,
    ])
    return full_pipeline[1:-1]
5 changes: 0 additions & 5 deletions devito/ir/ietxdsl/__init__.py

This file was deleted.

5 changes: 5 additions & 0 deletions devito/ir/xdsl_iet/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from devito.ir.xdsl_iet.cluster_to_ssa import finalize_module_with_globals

from devito.ir.xdsl_iet.profiling import apply_timers

# flake8: noqa
Loading

0 comments on commit 7a9354d

Please sign in to comment.