From d74d00facdad5104bd8f60f9371995a7b57110cb Mon Sep 17 00:00:00 2001
From: Andrey Pavlenko
Date: Tue, 10 Sep 2024 11:53:47 +0200
Subject: [PATCH] [Depend-#333] Convert a subset of GPU dialect ops to the GPU
 OpenCL runtime calls

---
 .../GPURuntime/GpuOclRuntime.h                |  29 +
 include/gc/Transforms/Passes.td               |   7 +
 lib/gc/Transforms/GPU/CMakeLists.txt          |   1 +
 lib/gc/Transforms/GPU/GpuToGpuOcl.cpp         | 568 ++++++++++++++++++
 4 files changed, 605 insertions(+)
 create mode 100644 include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
 create mode 100644 lib/gc/Transforms/GPU/GpuToGpuOcl.cpp

diff --git a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
new file mode 100644
index 000000000..b01b9f2c6
--- /dev/null
+++ b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -0,0 +1,29 @@
+//===-- GpuOclRuntime.h - GPU OpenCL runtime --------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_GPUOCLRUNTIME_H
+#define GC_GPUOCLRUNTIME_H
+
+namespace mlir::gc::gpu {
+constexpr char GPU_OCL_MALLOC[] = "gcGpuOclMalloc";
+constexpr char GPU_OCL_DEALLOC[] = "gcGpuOclDealloc";
+constexpr char GPU_OCL_MEMCPY[] = "gcGpuOclMemcpy";
+constexpr char GPU_OCL_KERNEL_CREATE[] = "gcGpuOclKernelCreate";
+constexpr char GPU_OCL_KERNEL_DESTROY[] = "gcGpuOclKernelDestroy";
+constexpr char GPU_OCL_KERNEL_LAUNCH[] = "gcGpuOclKernelLaunch";
+constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
+} // namespace mlir::gc::gpu
+
+#ifndef GC_GPU_OCL_CONST_ONLY
+
+// TBD
+
+#else
+#undef GC_GPU_OCL_CONST_ONLY
+#endif
+#endif
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index 5151a0335..6ba973361 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -93,6 +93,13 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
                "DPAS register block sizes MxNxK">,
   ];
 }
+
+def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
+  let summary = "Convert the GPU operations to GpuOclRuntime calls.";
+  let description = [{
+    Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
+  }];
+}
 #endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
diff --git a/lib/gc/Transforms/GPU/CMakeLists.txt b/lib/gc/Transforms/GPU/CMakeLists.txt
index 13f9c2981..5fd06e8db 100644
--- a/lib/gc/Transforms/GPU/CMakeLists.txt
+++ b/lib/gc/Transforms/GPU/CMakeLists.txt
@@ -1,4 +1,5 @@
 gc_add_mlir_library(GcGpuPasses
+  GpuToGpuOcl.cpp
   LinalgToXeGPU.cpp
   Pipeline.cpp
 
diff --git a/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
new file mode 100644
index 000000000..e48025b0d
--- /dev/null
+++ b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
@@ -0,0 +1,568 @@
+//===-- GpuToGpuOcl.cpp - GpuToGpuOcl pass ----------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <unordered_set>
+
+#define GC_GPU_OCL_CONST_ONLY
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+using namespace mlir;
+using namespace mlir::gc::gpu;
+
+namespace mlir::gc {
+#define GEN_PASS_DECL_GPUTOGPUOCL
+#define GEN_PASS_DEF_GPUTOGPUOCL
+#include "gc/Transforms/Passes.h.inc"
+} // namespace mlir::gc
+
+namespace {
+// Emits a call to the LLVM function `name`. If the enclosing module does not
+// declare such a function yet, a declaration with the given signature is
+// appended to the module first.
+LLVM::CallOp funcCall(OpBuilder &builder, const StringRef name,
+                      const Type returnType, const ArrayRef<Type> argTypes,
+                      const Location loc, const ArrayRef<Value> arguments,
+                      bool isVarArg = false) {
+  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
+  auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(name);
+  if (!function) {
+    auto type = LLVM::LLVMFunctionType::get(returnType, argTypes, isVarArg);
+    function = OpBuilder::atBlockEnd(module.getBody())
+                   .create<LLVM::LLVMFuncOp>(loc, name, type);
+  }
+  return builder.create<LLVM::CallOp>(loc, function, arguments);
+}
+
+// Assuming that the pointer to the context is passed as the last argument
+// of the current function of type memref with zero dims. When lowering
+// to LLVM, the memref arg is replaced with 3 args of types ptr, ptr, i64.
+// Returning the first one.
+Value getCtxPtr(const OpBuilder &rewriter) {
+  auto func =
+      rewriter.getBlock()->getParent()->getParentOfType<LLVM::LLVMFuncOp>();
+  return func.getArgument(func.getNumArguments() - 3);
+}
+
+// Shared state of the conversion patterns: the type converter, the commonly
+// used LLVM types and the names of the kernel modules processed so far.
+struct Helper final {
+  LLVMTypeConverter &converter;
+  Type voidType;
+  Type ptrType;
+  Type idxType;
+  mutable std::unordered_set<std::string> kernelNames;
+
+  explicit Helper(MLIRContext *ctx, LLVMTypeConverter &converter)
+      : converter(converter), voidType(LLVM::LLVMVoidType::get(ctx)),
+        ptrType(LLVM::LLVMPointerType::get(ctx)),
+        idxType(IntegerType::get(ctx, converter.getPointerBitwidth())) {}
+
+  // Creates an LLVM constant of the index type (pointer-sized integer).
+  Value idxConstant(OpBuilder &rewriter, const Location loc,
+                    size_t value) const {
+    return rewriter.create<LLVM::ConstantOp>(
+        loc, idxType,
+        rewriter.getIntegerAttr(idxType, static_cast<int64_t>(value)));
+  }
+
+  // Stores the kernel pointers to a stack-allocated array and passes it to
+  // the runtime function destroying the kernels.
+  void destroyKernels(OpBuilder &rewriter, Location loc,
+                      ArrayRef<Value> kernelPtrs) const {
+    auto size = idxConstant(rewriter, loc, kernelPtrs.size());
+    auto kernelPtrsArray =
+        rewriter.create<LLVM::AllocaOp>(loc, ptrType, ptrType, size);
+    for (size_t i = 0, n = kernelPtrs.size(); i < n; i++) {
+      auto elementPtr =
+          rewriter.create<LLVM::GEPOp>(loc, ptrType, ptrType, kernelPtrsArray,
+                                       idxConstant(rewriter, loc, i));
+      rewriter.create<LLVM::StoreOp>(loc, kernelPtrs[i], elementPtr);
+    }
+
+    funcCall(rewriter, GPU_OCL_KERNEL_DESTROY, voidType, {idxType, ptrType},
+             loc, {size, kernelPtrsArray});
+  }
+};
+
+// Base class of the patterns below, giving them access to the Helper.
+template <typename SourceOp>
+struct ConvertOpPattern : ConvertOpToLLVMPattern<SourceOp> {
+  const Helper &helper;
+
+  explicit ConvertOpPattern(const Helper &helper)
+      : ConvertOpToLLVMPattern<SourceOp>(helper.converter), helper(helper) {}
+};
+
+// Converts gpu.alloc to a call of the runtime malloc function and builds the
+// memref descriptor from the returned pointer.
+struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
+  explicit ConvertAlloc(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::AllocOp allocOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = allocOp.getLoc();
+    MemRefType type = allocOp.getType();
+    auto shape = type.getShape();
+    auto dynamics = adaptor.getDynamicSizes();
+
+    if (shape.empty() || dynamics.empty()) {
+      int64_t staticSize;
+      if (shape.empty()) {
+        // NOTE(review): a rank-0 memref holds one element, but zero bytes are
+        // requested here - confirm that this is intended.
+        staticSize = 0;
+      } else {
+        // Round the element size up to whole bytes (e.g. 1 byte for i1).
+        staticSize = (type.getElementType().getIntOrFloatBitWidth() + 7) / 8;
+        for (auto dim : shape) {
+          assert(dim != ShapedType::kDynamic);
+          staticSize *= dim;
+        }
+      }
+      auto size = helper.idxConstant(rewriter, loc, staticSize);
+      auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+                          {helper.ptrType, helper.idxType}, loc,
+                          {getCtxPtr(rewriter), size})
+                     .getResult();
+      Value replacement = MemRefDescriptor::fromStaticShape(
+          rewriter, loc, helper.converter, type, ptr, ptr);
+      rewriter.replaceOp(allocOp, replacement);
+      return success();
+    }
+
+    auto ndims = shape.size();
+    SmallVector<Value> newShape;
+    SmallVector<Value> newStrides(ndims);
+    int64_t staticSize =
+        (type.getElementType().getIntOrFloatBitWidth() + 7) / 8;
+    auto size = dynamics[0];
+
+    auto idxMul = [&](Value x, Value y) -> Value {
+      if (auto xConst = getConstantIntValue(x)) {
+        if (auto yConst = getConstantIntValue(y)) {
+          return helper.idxConstant(rewriter, loc,
+                                    xConst.value() * yConst.value());
+        }
+      }
+      return rewriter.create<LLVM::MulOp>(loc, x, y);
+    };
+
+    for (size_t i = 0, j = 0; i < ndims; i++) {
+      auto dim = shape[i];
+      if (dim == ShapedType::kDynamic) {
+        auto dynSize = dynamics[j++];
+        newShape.emplace_back(dynSize);
+        if (j != 1) {
+          size = idxMul(size, dynSize);
+        }
+      } else {
+        staticSize *= dim;
+        newShape.emplace_back(helper.idxConstant(rewriter, loc, dim));
+      }
+    }
+
+    size = idxMul(size, helper.idxConstant(rewriter, loc, staticSize));
+    auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+                        {helper.ptrType, helper.idxType}, loc,
+                        {getCtxPtr(rewriter), size})
+                   .getResult();
+
+    // Row-major strides: the stride of dimension i is the stride of dimension
+    // i + 1 multiplied by the size of dimension i + 1.
+    newStrides[ndims - 1] = helper.idxConstant(rewriter, loc, 1);
+    for (int i = static_cast<int>(ndims) - 2; i >= 0; i--) {
+      newStrides[i] = idxMul(newStrides[i + 1], newShape[i + 1]);
+    }
+
+    auto dsc = MemRefDescriptor::undef(rewriter, loc,
+                                       helper.converter.convertType(type));
+    dsc.setAllocatedPtr(rewriter, loc, ptr);
+    dsc.setAlignedPtr(rewriter, loc, ptr);
+    dsc.setOffset(rewriter, loc, helper.idxConstant(rewriter, loc, 0));
+
+    for (unsigned i = 0, n = static_cast<unsigned>(ndims); i < n; i++) {
+      dsc.setSize(rewriter, loc, i, newShape[i]);
+      dsc.setStride(rewriter, loc, i, newStrides[i]);
+    }
+
+    rewriter.replaceOp(allocOp, static_cast<Value>(dsc));
+    return success();
+  }
+};
+
+// Converts gpu.dealloc to a call of the runtime dealloc function.
+struct ConvertDealloc final : ConvertOpPattern<gpu::DeallocOp> {
+  explicit ConvertDealloc(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::DeallocOp gpuDealloc, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = gpuDealloc.getLoc();
+    MemRefDescriptor dsc(adaptor.getMemref());
+    auto ptr = dsc.allocatedPtr(rewriter, loc);
+    auto oclDealloc = funcCall(rewriter, GPU_OCL_DEALLOC, helper.voidType,
+                               {helper.ptrType, helper.ptrType}, loc,
+                               {getCtxPtr(rewriter), ptr});
+    rewriter.replaceOp(gpuDealloc, oclDealloc);
+    return success();
+  }
+};
+
+// Converts gpu.memcpy to a call of the runtime memcpy function. Only the
+// statically shaped memrefs are supported.
+struct ConvertMemcpy final : ConvertOpPattern<gpu::MemcpyOp> {
+  explicit ConvertMemcpy(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::MemcpyOp gpuMemcpy, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = gpuMemcpy.getLoc();
+    auto srcType = gpuMemcpy.getSrc().getType();
+    const uint64_t elementSize =
+        (srcType.getElementType().getIntOrFloatBitWidth() + 7) / 8;
+    // Multiply all the dimensions, starting from 1, so that a zero-sized
+    // dimension makes the whole product zero. An empty shape gives 0.
+    uint64_t numElements = srcType.getShape().empty() ? 0 : 1;
+    for (auto dim : srcType.getShape()) {
+      if (dim == ShapedType::kDynamic) {
+        gpuMemcpy.emitOpError()
+            << "dynamic shapes are currently not supported";
+        return failure();
+      }
+      numElements *= dim;
+    }
+
+    MemRefDescriptor srcDsc(adaptor.getSrc());
+    MemRefDescriptor dstDsc(adaptor.getDst());
+    auto srcPtr = srcDsc.alignedPtr(rewriter, loc);
+    auto dstPtr = dstDsc.alignedPtr(rewriter, loc);
+    auto size = helper.idxConstant(rewriter, loc, elementSize * numElements);
+    auto oclMemcpy = funcCall(
+        rewriter, GPU_OCL_MEMCPY, helper.voidType,
+        {helper.ptrType, helper.ptrType, helper.ptrType, helper.idxType}, loc,
+        {getCtxPtr(rewriter), srcPtr, dstPtr, size});
+    rewriter.replaceOp(gpuMemcpy, oclMemcpy);
+    return success();
+  }
+};
+
+// Converts gpu.launch_func to a call of the runtime launch function. The
+// kernel is created lazily on the first launch and cached in a module-level
+// global variable.
+struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
+
+  explicit ConvertLaunch(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::LaunchFuncOp gpuLaunch, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto kernelPtr = getKernel(gpuLaunch, adaptor, rewriter);
+    if (!kernelPtr) {
+      return failure();
+    }
+
+    const Location loc = gpuLaunch.getLoc();
+    auto kernelArgs = adaptor.getKernelOperands();
+    SmallVector<Value> args;
+    args.reserve(kernelArgs.size() + 2);
+    args.emplace_back(getCtxPtr(rewriter));
+    args.emplace_back(kernelPtr.value());
+
+    int i = 0;
+    for (auto arg : kernelArgs) {
+      if (auto type = gpuLaunch.getKernelOperand(i++).getType();
+          isa<MemRefType>(type)) {
+        MemRefDescriptor desc(arg);
+        args.emplace_back(desc.alignedPtr(rewriter, loc));
+      } else {
+        // Store the arg on the stack and pass the pointer
+        auto ptr = rewriter.create<LLVM::AllocaOp>(
+            loc, helper.ptrType, typeConverter->convertType(type),
+            helper.idxConstant(rewriter, loc, 1));
+        rewriter.create<LLVM::StoreOp>(loc, arg, ptr);
+        args.emplace_back(ptr);
+      }
+    }
+
+    const auto gpuOclLaunch =
+        funcCall(rewriter, GPU_OCL_KERNEL_LAUNCH, helper.voidType,
+                 {helper.ptrType, helper.ptrType}, loc, args, true);
+    rewriter.replaceOp(gpuLaunch, gpuOclLaunch);
+    return success();
+  }
+
+private:
+  // Returns the kernel pointer stored in the global var ...name_Ptr.
+  // If it's NULL, calls the createKernel() function.
+  std::optional<Value> getKernel(gpu::LaunchFuncOp &gpuLaunch,
+                                 OpAdaptor &adaptor,
+                                 ConversionPatternRewriter &rewriter) const {
+    auto loc = gpuLaunch.getLoc();
+    auto ctx = getCtxPtr(rewriter);
+    auto mod = rewriter.getBlock()->getParent()->getParentOfType<ModuleOp>();
+    auto kernelModName = gpuLaunch.getKernelModuleName();
+    SmallString<128> getFuncName("getGcGpuOclKernel_");
+    getFuncName.append(kernelModName);
+
+    if (helper.kernelNames
+            .insert(std::string(kernelModName.begin(), kernelModName.end()))
+            .second) {
+      auto insPoint = rewriter.saveInsertionPoint();
+      SmallString<128> strBuf("gcGpuOclKernel_");
+      strBuf.append(kernelModName);
+      strBuf.append("_");
+      auto strBufStart = strBuf.size();
+      auto str = [&strBuf,
+                  strBufStart](const char *chars) -> SmallString<128> & {
+        strBuf.truncate(strBufStart);
+        strBuf.append(chars);
+        return strBuf;
+      };
+
+      SmallString<128> createFuncName("createGcGpuOclKernel_");
+      createFuncName.append(kernelModName);
+      if (!createKernel(gpuLaunch, adaptor, rewriter, loc, mod, createFuncName,
+                        str)) {
+        return std::nullopt;
+      }
+
+      auto function = rewriter.create<LLVM::LLVMFuncOp>(
+          loc, getFuncName,
+          LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
+          LLVM::Linkage::Internal);
+      rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));
+
+      auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
+      assert(ptr);
+      auto null = rewriter.create<LLVM::ZeroOp>(loc, helper.ptrType);
+      auto ptrPtr = rewriter.create<LLVM::AddressOfOp>(loc, ptr);
+      auto ptrVal = rewriter.create<LLVM::LoadOp>(loc, helper.ptrType, ptrPtr);
+      auto cmp = rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::eq,
+                                               ptrVal, null);
+
+      auto body = &function.getBody();
+      auto thenBlock = rewriter.createBlock(body);
+      auto elseBlock = rewriter.createBlock(body);
+      rewriter.setInsertionPointToEnd(&body->front());
+      rewriter.create<LLVM::CondBrOp>(loc, cmp, thenBlock, elseBlock);
+
+      // Then block
+      rewriter.setInsertionPointToStart(thenBlock);
+      auto result = funcCall(rewriter, createFuncName, helper.ptrType,
+                             {helper.ptrType}, loc, {function.getArgument(0)});
+      rewriter.create<LLVM::ReturnOp>(loc, result.getResult());
+
+      // Else block
+      rewriter.setInsertionPointToStart(elseBlock);
+      rewriter.create<LLVM::ReturnOp>(loc, ptrVal);
+
+      rewriter.restoreInsertionPoint(insPoint);
+    }
+
+    auto kernelFunc = mod.lookupSymbol<LLVM::LLVMFuncOp>(getFuncName);
+    if (!kernelFunc) {
+      gpuLaunch.emitOpError() << "Function " << getFuncName << " not found!";
+      return std::nullopt;
+    }
+    return rewriter.create<LLVM::CallOp>(loc, kernelFunc, ValueRange(ctx))
+        .getResult();
+  }
+
+  // Create a new kernel and save the pointer to the global variable
+  // ...name_Ptr.
+  bool createKernel(
+      gpu::LaunchFuncOp &gpuLaunch, OpAdaptor &adaptor,
+      ConversionPatternRewriter &rewriter, const Location &loc, ModuleOp &mod,
+      StringRef funcName,
+      const std::function<SmallString<128> &(const char *chars)> &str) const {
+    auto kernelModName = gpuLaunch.getKernelModuleName();
+    auto kernelMod = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
+        gpuLaunch, kernelModName);
+    if (!kernelMod) {
+      gpuLaunch.emitOpError() << "Module " << kernelModName << " not found!";
+      return false;
+    }
+    const auto binaryAttr = kernelMod->getAttrOfType<StringAttr>("gpu.binary");
+    if (!binaryAttr) {
+      kernelMod.emitOpError() << "missing 'gpu.binary' attribute";
+      return false;
+    }
+
+    rewriter.setInsertionPointToStart(mod.getBody());
+    // The kernel pointer is stored here
+    rewriter.create<LLVM::GlobalOp>(loc, helper.ptrType, /*isConstant=*/false,
+                                    LLVM::Linkage::Internal, str("Ptr"),
+                                    rewriter.getZeroAttr(helper.ptrType));
+    rewriter.eraseOp(kernelMod);
+
+    auto function = rewriter.create<LLVM::LLVMFuncOp>(
+        loc, funcName,
+        LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
+        LLVM::Linkage::Internal);
+    rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));
+
+    auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
+    assert(ptr);
+    SmallVector<char> nameChars(kernelModName.getValue().begin(),
+                                kernelModName.getValue().end());
+    nameChars.emplace_back('\0');
+    // Kernel name and SPIRV are stored as global strings
+    auto name = LLVM::createGlobalString(
+        loc, rewriter, str("Name"),
+        StringRef(nameChars.data(), nameChars.size()), LLVM::Linkage::Internal);
+    auto spirv = LLVM::createGlobalString(loc, rewriter, str("SPIRV"),
+                                          binaryAttr.getValue(),
+                                          LLVM::Linkage::Internal);
+    auto spirvSize = rewriter.create<LLVM::ConstantOp>(
+        loc, helper.idxType,
+        IntegerAttr::get(helper.idxType,
+                         static_cast<int64_t>(binaryAttr.size())));
+
+    SmallVector<Value> gridSize;
+    SmallVector<Value> blockSize;
+    SmallVector<Value> argSize;
+    gridSize.emplace_back(gpuLaunch.getGridSizeX());
+    gridSize.emplace_back(gpuLaunch.getGridSizeY());
+    gridSize.emplace_back(gpuLaunch.getGridSizeZ());
+    blockSize.emplace_back(gpuLaunch.getBlockSizeX());
+    blockSize.emplace_back(gpuLaunch.getBlockSizeY());
+    blockSize.emplace_back(gpuLaunch.getBlockSizeZ());
+
+    for (auto arg : adaptor.getKernelOperands()) {
+      auto type = arg.getType();
+      // Assuming that the value is either an integer or a float or a pointer.
+      // In the latter case, the size is 0 bytes. Sizes are rounded up to bytes.
+      auto size =
+          type.isIntOrFloat() ? (type.getIntOrFloatBitWidth() + 7) / 8 : 0;
+      argSize.emplace_back(helper.idxConstant(rewriter, loc, size));
+    }
+
+    auto array = [&](SmallVector<Value> &values) {
+      auto size = helper.idxConstant(rewriter, loc, values.size());
+      auto arrayPtr = rewriter.create<LLVM::AllocaOp>(loc, helper.ptrType,
+                                                      helper.idxType, size);
+      for (size_t i = 0, n = values.size(); i < n; i++) {
+        auto elementPtr = rewriter.create<LLVM::GEPOp>(
+            loc, helper.ptrType, helper.idxType, arrayPtr,
+            helper.idxConstant(rewriter, loc, i));
+        auto value = values[i];
+        if (auto cast = value.getDefiningOp<UnrealizedConversionCastOp>()) {
+          assert(getConstantIntValue(cast.getOperand(0)));
+          value = helper.idxConstant(
+              rewriter, loc, getConstantIntValue(cast.getOperand(0)).value());
+        }
+        rewriter.create<LLVM::StoreOp>(loc, value, elementPtr);
+      }
+      return arrayPtr.getResult();
+    };
+
+    auto ctx = function.getArgument(0);
+    auto argNum =
+        helper.idxConstant(rewriter, loc, adaptor.getKernelOperands().size());
+    auto createKernelCall = funcCall(
+        rewriter, GPU_OCL_KERNEL_CREATE, helper.ptrType,
+        {helper.ptrType, helper.idxType, helper.ptrType, helper.ptrType,
+         helper.ptrType, helper.ptrType, helper.idxType, helper.ptrType},
+        loc,
+        {ctx, spirvSize, spirv, name, array(gridSize), array(blockSize), argNum,
+         array(argSize)});
+    auto result = createKernelCall.getResult();
+
+    // Save the kernel pointer to the global var using CAS
+    auto null = rewriter.create<LLVM::ZeroOp>(loc, helper.ptrType);
+    auto ptrPtr = rewriter.create<LLVM::AddressOfOp>(loc, ptr);
+    auto casResult = rewriter.create<LLVM::AtomicCmpXchgOp>(
+        loc, ptrPtr, null, result, LLVM::AtomicOrdering::acq_rel,
+        LLVM::AtomicOrdering::monotonic);
+    auto casFlag = rewriter.create<LLVM::ExtractValueOp>(
+        loc, rewriter.getI1Type(), casResult, 1);
+
+    auto body = &function.getBody();
+    auto thenBlock = rewriter.createBlock(body);
+    auto elseBlock = rewriter.createBlock(body);
+    rewriter.setInsertionPointToEnd(&body->front());
+    rewriter.create<LLVM::CondBrOp>(loc, casFlag, thenBlock, elseBlock);
+
+    // Then block
+    rewriter.setInsertionPointToStart(thenBlock);
+    rewriter.create<LLVM::ReturnOp>(loc, result);
+
+    // Else block
+    // The kernel has already been created by another thread, destroying this
+    // one.
+    rewriter.setInsertionPointToStart(elseBlock);
+    helper.destroyKernels(rewriter, loc, result);
+    result = rewriter.create<LLVM::ExtractValueOp>(loc, helper.ptrType,
+                                                   casResult, 0);
+    rewriter.create<LLVM::ReturnOp>(loc, result);
+
+    rewriter.setInsertionPointAfter(function);
+    return true;
+  }
+};
+
+// The pass: applies the conversion patterns and adds the module destructor
+// function that destroys all the created kernels.
+struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase<GpuToGpuOcl> {
+
+  void runOnOperation() override {
+    const auto ctx = &getContext();
+    const LLVMConversionTarget target(getContext());
+    LLVMTypeConverter converter(ctx);
+    Helper helper(ctx, converter);
+    RewritePatternSet patterns(ctx);
+
+    populateGpuToLLVMConversionPatterns(converter, patterns);
+    patterns.insert<ConvertAlloc, ConvertMemcpy, ConvertLaunch, ConvertDealloc>(
+        helper);
+
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+
+    // Add gpuOclDestructor() function that destroys all the kernels
+    auto mod = llvm::dyn_cast<ModuleOp>(getOperation());
+    assert(mod);
+    OpBuilder rewriter(mod.getBody(), mod.getBody()->end());
+    auto destruct = rewriter.create<LLVM::LLVMFuncOp>(
+        mod.getLoc(), GPU_OCL_MOD_DESTRUCTOR,
+        LLVM::LLVMFunctionType::get(helper.voidType, {}),
+        LLVM::Linkage::External);
+    auto loc = destruct.getLoc();
+    rewriter.setInsertionPointToStart(destruct.addEntryBlock(rewriter));
+    // Add memory fence
+    rewriter.create<LLVM::FenceOp>(loc, LLVM::AtomicOrdering::acquire);
+
+    SmallVector<Value> kernelPtrs;
+    SmallString<128> strBuf("gcGpuOclKernel_");
+    auto strBufStart = strBuf.size();
+    kernelPtrs.reserve(helper.kernelNames.size());
+    for (auto &name : helper.kernelNames) {
+      strBuf.truncate(strBufStart);
+      strBuf.append(name);
+      strBuf.append("_Ptr");
+      auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(strBuf);
+      assert(ptr);
+      auto ptrVal = rewriter.create<LLVM::LoadOp>(
+          loc, helper.ptrType, rewriter.create<LLVM::AddressOfOp>(loc, ptr));
+      kernelPtrs.emplace_back(ptrVal);
+    }
+
+    helper.destroyKernels(rewriter, loc, kernelPtrs);
+    rewriter.create<LLVM::ReturnOp>(loc, ValueRange{});
+  }
+};
+} // namespace