Skip to content

Commit

Permalink
Added test, fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreyPavlenko committed Sep 19, 2024
1 parent c4d48f9 commit 34415b1
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 5 deletions.
7 changes: 7 additions & 0 deletions include/gc/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
];
}

def AddContextArg : Pass<"add-ctx-arg", "func::FuncOp"> {
let summary = "Add a context argument.";
let description = [{
Adds a new memref argument to the function that can be used to pass some context.
}];
}

def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
let summary = "Convert the GPU operations to GpuOclRuntime calls.";
let description = [{
Expand Down
45 changes: 45 additions & 0 deletions lib/gc/Transforms/GPU/AddContextArg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===-- AddContextArg.cpp - Add context argument ----------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"

namespace mlir::gc {
#define GEN_PASS_DECL_ADDCONTEXTARG
#define GEN_PASS_DEF_ADDCONTEXTARG
#include "gc/Transforms/Passes.h.inc"
} // namespace mlir::gc

using namespace mlir;

namespace {
/// Appends a trailing context argument to the function signature and forwards
/// it to every call made from this function, so that an opaque runtime context
/// can be threaded through the IR. The argument is a rank-0 memref of i8,
/// which here acts as an opaque handle.
struct AddContextArg final : gc::impl::AddContextArgBase<AddContextArg> {
  void runOnOperation() override {
    auto func = getOperation();
    auto funcType = func.getFunctionType();
    auto argTypes = llvm::to_vector<8>(funcType.getInputs());
    auto resultTypes = llvm::to_vector<1>(funcType.getResults());
    auto ctx = func->getContext();
    auto newArgType = MemRefType::get({}, IntegerType::get(ctx, 8));
    argTypes.emplace_back(newArgType);
    func.setType(FunctionType::get(ctx, argTypes, resultTypes));

    // External declarations have no body; only the type can be updated.
    if (func.getBody().empty()) {
      return;
    }

    // The entry block's arguments must stay in sync with the function type
    // regardless of how many blocks the body contains. The previous
    // `hasOneBlock()` guard skipped multi-block bodies, leaving the signature
    // and the entry block inconsistent and making the getArgument() call
    // below index out of range.
    func.getBody().front().addArgument(newArgType, func.getLoc());

    // Find all function calls and append the last argument of the current
    // function to the call. The callee signatures are extended by this same
    // pass when it visits them.
    auto ctxArg = func.getArgument(func.getNumArguments() - 1);
    func.walk([&](func::CallOp call) {
      auto args = llvm::to_vector<8>(call.getOperands());
      args.emplace_back(ctxArg);
      call->setOperands(args);
    });
  }
};
} // namespace
1 change: 1 addition & 0 deletions lib/gc/Transforms/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
gc_add_mlir_library(GcGpuPasses
AddContextArg.cpp
GpuToGpuOcl.cpp
LinalgToXeGPU.cpp
Pipeline.cpp
Expand Down
1 change: 1 addition & 0 deletions lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
loc, getFuncName,
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
LLVM::Linkage::Internal);
function.setAlwaysInline(true);
rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));

auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
Expand Down
11 changes: 8 additions & 3 deletions lib/gc/Transforms/GPU/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
namespace mlir::gc {

void populateGPUPipeline(mlir::OpPassManager &pm) {
// Add an argument for the GPU context
pm.addNestedPass<func::FuncOp>(createAddContextArg());

pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());

pm.addPass(bufferization::createEmptyTensorEliminationPass());
Expand Down Expand Up @@ -76,7 +79,8 @@ void populateGPUPipeline(mlir::OpPassManager &pm) {
pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
pm.addNestedPass<func::FuncOp>(createParallelLoopToGpuPass());

pm.addNestedPass<func::FuncOp>(imex::createInsertGPUAllocsPass("opencl"));
// Temporarily disabled until #344 is implemented
// pm.addNestedPass<func::FuncOp>(imex::createInsertGPUAllocsPass("opencl"));
pm.addPass(createGpuKernelOutliningPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(imex::createSetSPIRVCapabilitiesPass());
Expand All @@ -95,15 +99,16 @@ void populateGPUPipeline(mlir::OpPassManager &pm) {
pm.addNestedPass<func::FuncOp>(LLVM::createRequestCWrappersPass());
pm.addPass(imex::createSerializeSPIRVPass());
pm.addPass(createConvertVectorToSCFPass());
pm.addPass(imex::createConvertGPUToGPUXPass());
// pm.addPass(imex::createConvertGPUToGPUXPass());
pm.addPass(createConvertSCFToCFPass());
pm.addPass(createConvertControlFlowToLLVMPass());
pm.addPass(createConvertVectorToLLVMPass());
pm.addPass(createConvertIndexToLLVMPass());
pm.addPass(createArithToLLVMConversionPass());
pm.addPass(createConvertFuncToLLVMPass());
pm.addPass(createConvertMathToLLVMPass());
pm.addPass(imex::createConvertGPUXToLLVMPass());
// pm.addPass(imex::createConvertGPUXToLLVMPass());
pm.addPass(createGpuToGpuOcl());
pm.addPass(createConvertIndexToLLVMPass());
pm.addPass(memref::createExpandStridedMetadataPass());
pm.addPass(createLowerAffinePass());
Expand Down
2 changes: 1 addition & 1 deletion lib/gc/Transforms/IterativeTilingAndFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ defaultTilingOfType(RewriterBase &rewriter, Operation *op,
} else {
defaultTileSize.resize(iteratorTypes.size(), rewriter.getIndexAttr(0));
// Try tileSize from `32` to `16`.
SmallVector<int64_t> tsOrder = {32, 16};
SmallVector<int64_t> tsOrder = {16, 32};
// Record how many dims have been tiled, including fully tiled, i.e.
// tileSize == dimSize.
unsigned nonOneTileDims =
Expand Down
2 changes: 2 additions & 0 deletions test/mlir/test/gc/gpu-runner/XeGPU/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# GPUX is currently disabled, so none of the XeGPU runner tests in this
# directory can execute; mark them all as unsupported for lit.
config.unsupported = True
63 changes: 63 additions & 0 deletions test/mlir/test/gc/gpu-runner/gpu-to-gpuocl.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// RUN: gc-opt %s --gc-gpu-pipeline | FileCheck %s

// Element-wise add of two 32x32xf32 buffers into %arg2 via linalg.add.
// The CHECK lines below expect the gc-gpu-pipeline to lower this to
// GpuOclRuntime calls (kernel create/get/launch).
module @test {
// %arg2 is the output buffer; %arg0/%arg1 are read-only inputs.
func.func @entry(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
%0 = bufferization.to_tensor %arg0 restrict : memref<32x32xf32>
%1 = bufferization.to_tensor %arg1 restrict : memref<32x32xf32>
%2 = tensor.empty() : tensor<32x32xf32>
%3 = linalg.add ins(%1, %0 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%2 : tensor<32x32xf32>) -> tensor<32x32xf32>
bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
return
}
}

// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr

// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
// CHECK: [[BB1]]:
// CHECK: llvm.return [[NEW_PTR]]
// CHECK: [[BB2]]:
// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
// CHECK: llvm.store [[NEW_PTR]], [[ARRAY]]
// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
// CHECK: llvm.return [[OLD_PTR]]

// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
// CHECK: [[BB1]]:
// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
// CHECK: llvm.return [[NEW_PTR]]
// CHECK: [[BB2]]:
// CHECK: llvm.return [[PTR]]

// CHECK: llvm.func @entry
// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],

// CHECK: llvm.func @gcGpuOclKernelCreate
// CHECK: llvm.func @gcGpuOclKernelDestroy
// CHECK: llvm.func @gcGpuOclKernelLaunch


// CHECK: llvm.func @gcGpuOclModuleDestructor()
// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
// CHECK: llvm.fence acquire
// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
// CHECK: llvm.store [[PTR]], [[ARRAY]]
// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
5 changes: 4 additions & 1 deletion test/mlir/test/gc/gpu-runner/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# Without IMEX support none of the GPU runner tests can execute.
if config.gc_use_imex:
    # FIXME: Enable when the GPU runner is implemented.
    config.excludes = ['mlp.mlir']
else:
    config.unsupported = True

0 comments on commit 34415b1

Please sign in to comment.