Skip to content

Commit

Permalink
Added test, fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreyPavlenko committed Sep 19, 2024
1 parent c4d48f9 commit 34415b1
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 5 deletions.
7 changes: 7 additions & 0 deletions include/gc/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
];
}

def AddContextArg : Pass<"add-ctx-arg", "func::FuncOp"> {
let summary = "Add a context argument.";
let description = [{
Adds a new memref argument to the function that can be used to pass some context.
}];
}

def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
let summary = "Convert the GPU operations to GpuOclRuntime calls.";
let description = [{
Expand Down
45 changes: 45 additions & 0 deletions lib/gc/Transforms/GPU/AddContextArg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===-- AddContextArg.cpp - Add context argument ----------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"

namespace mlir::gc {
#define GEN_PASS_DECL_ADDCONTEXTARG
#define GEN_PASS_DEF_ADDCONTEXTARG
#include "gc/Transforms/Passes.h.inc"
} // namespace mlir::gc

using namespace mlir;

namespace {
/// Appends a trailing context argument to the function signature and forwards
/// it to every call made from this function, so that an opaque runtime context
/// can be threaded through the IR. The argument is a rank-0 memref of i8,
/// which here acts as an opaque handle.
struct AddContextArg final : gc::impl::AddContextArgBase<AddContextArg> {
  void runOnOperation() override {
    auto func = getOperation();
    auto funcType = func.getFunctionType();
    auto argTypes = llvm::to_vector<8>(funcType.getInputs());
    auto resultTypes = llvm::to_vector<1>(funcType.getResults());
    auto ctx = func->getContext();
    auto newArgType = MemRefType::get({}, IntegerType::get(ctx, 8));
    argTypes.emplace_back(newArgType);
    func.setType(FunctionType::get(ctx, argTypes, resultTypes));

    // External declarations have no body; only the type can be updated.
    if (func.getBody().empty()) {
      return;
    }

    // The entry block's arguments must stay in sync with the function type
    // regardless of how many blocks the body contains. The previous
    // `hasOneBlock()` guard skipped multi-block bodies, leaving the signature
    // and the entry block inconsistent and making the getArgument() call
    // below index out of range.
    func.getBody().front().addArgument(newArgType, func.getLoc());

    // Find all function calls and append the last argument of the current
    // function to the call. The callee signatures are extended by this same
    // pass when it visits them.
    auto ctxArg = func.getArgument(func.getNumArguments() - 1);
    func.walk([&](func::CallOp call) {
      auto args = llvm::to_vector<8>(call.getOperands());
      args.emplace_back(ctxArg);
      call->setOperands(args);
    });
  }
};
} // namespace
1 change: 1 addition & 0 deletions lib/gc/Transforms/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
gc_add_mlir_library(GcGpuPasses
AddContextArg.cpp
GpuToGpuOcl.cpp
LinalgToXeGPU.cpp
Pipeline.cpp
Expand Down
1 change: 1 addition & 0 deletions lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
loc, getFuncName,
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
LLVM::Linkage::Internal);
function.setAlwaysInline(true);
rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));

auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
Expand Down
11 changes: 8 additions & 3 deletions lib/gc/Transforms/GPU/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
namespace mlir::gc {

void populateGPUPipeline(mlir::OpPassManager &pm) {
// Add an argument for the GPU context
pm.addNestedPass<func::FuncOp>(createAddContextArg());

pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());

pm.addPass(bufferization::createEmptyTensorEliminationPass());
Expand Down Expand Up @@ -76,7 +79,8 @@ void populateGPUPipeline(mlir::OpPassManager &pm) {
pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
pm.addNestedPass<func::FuncOp>(createParallelLoopToGpuPass());

pm.addNestedPass<func::FuncOp>(imex::createInsertGPUAllocsPass("opencl"));
// Temporarily disabled until #344 is implemented
// pm.addNestedPass<func::FuncOp>(imex::createInsertGPUAllocsPass("opencl"));
pm.addPass(createGpuKernelOutliningPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(imex::createSetSPIRVCapabilitiesPass());
Expand All @@ -95,15 +99,16 @@ void populateGPUPipeline(mlir::OpPassManager &pm) {
pm.addNestedPass<func::FuncOp>(LLVM::createRequestCWrappersPass());
pm.addPass(imex::createSerializeSPIRVPass());
pm.addPass(createConvertVectorToSCFPass());
pm.addPass(imex::createConvertGPUToGPUXPass());
// pm.addPass(imex::createConvertGPUToGPUXPass());
pm.addPass(createConvertSCFToCFPass());
pm.addPass(createConvertControlFlowToLLVMPass());
pm.addPass(createConvertVectorToLLVMPass());
pm.addPass(createConvertIndexToLLVMPass());
pm.addPass(createArithToLLVMConversionPass());
pm.addPass(createConvertFuncToLLVMPass());
pm.addPass(createConvertMathToLLVMPass());
pm.addPass(imex::createConvertGPUXToLLVMPass());
// pm.addPass(imex::createConvertGPUXToLLVMPass());
pm.addPass(createGpuToGpuOcl());
pm.addPass(createConvertIndexToLLVMPass());
pm.addPass(memref::createExpandStridedMetadataPass());
pm.addPass(createLowerAffinePass());
Expand Down
2 changes: 1 addition & 1 deletion lib/gc/Transforms/IterativeTilingAndFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ defaultTilingOfType(RewriterBase &rewriter, Operation *op,
} else {
defaultTileSize.resize(iteratorTypes.size(), rewriter.getIndexAttr(0));
// Try tileSize from `32` to `16`.
SmallVector<int64_t> tsOrder = {32, 16};
SmallVector<int64_t> tsOrder = {16, 32};
// Record how many dims have been tiled, including fully tiled, i.e.
// tileSize == dimSize.
unsigned nonOneTileDims =
Expand Down
2 changes: 2 additions & 0 deletions test/mlir/test/gc/gpu-runner/XeGPU/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# GPUX is currently disabled, so none of the XeGPU runner tests in this
# directory can execute; mark them all as unsupported for lit.
config.unsupported = True
63 changes: 63 additions & 0 deletions test/mlir/test/gc/gpu-runner/gpu-to-gpuocl.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// RUN: gc-opt %s --gc-gpu-pipeline | FileCheck %s

// Element-wise add of two 32x32xf32 buffers into %arg2 via linalg.add.
// The CHECK lines below expect the gc-gpu-pipeline to lower this to
// GpuOclRuntime calls (kernel create/get/launch).
module @test {
// %arg2 is the output buffer; %arg0/%arg1 are read-only inputs.
func.func @entry(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
%0 = bufferization.to_tensor %arg0 restrict : memref<32x32xf32>
%1 = bufferization.to_tensor %arg1 restrict : memref<32x32xf32>
%2 = tensor.empty() : tensor<32x32xf32>
%3 = linalg.add ins(%1, %0 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%2 : tensor<32x32xf32>) -> tensor<32x32xf32>
bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
return
}
}

// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr

// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
// CHECK: [[BB1]]:
// CHECK: llvm.return [[NEW_PTR]]
// CHECK: [[BB2]]:
// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
// CHECK: llvm.store [[NEW_PTR]], [[ARRAY]]
// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
// CHECK: llvm.return [[OLD_PTR]]

// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
// CHECK: [[BB1]]:
// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
// CHECK: llvm.return [[NEW_PTR]]
// CHECK: [[BB2]]:
// CHECK: llvm.return [[PTR]]

// CHECK: llvm.func @entry
// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],

// CHECK: llvm.func @gcGpuOclKernelCreate
// CHECK: llvm.func @gcGpuOclKernelDestroy
// CHECK: llvm.func @gcGpuOclKernelLaunch


// CHECK: llvm.func @gcGpuOclModuleDestructor()
// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
// CHECK: llvm.fence acquire
// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
// CHECK: llvm.store [[PTR]], [[ARRAY]]
// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
5 changes: 4 additions & 1 deletion test/mlir/test/gc/gpu-runner/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# Without IMEX support none of the GPU runner tests can execute.
if config.gc_use_imex:
    # FIXME: Enable when the GPU runner is implemented.
    config.excludes = ['mlp.mlir']
else:
    config.unsupported = True

0 comments on commit 34415b1

Please sign in to comment.