From 774893dcd929c370bad714a70a7d670bb2d6f649 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 9 Oct 2024 10:40:49 -0500 Subject: [PATCH] [mlir][ROCDL] Plumb through AMDGPU memory access metadata (#110916) The LLVM backend has moved from function-wide attributes for making assurances about potentially unsafe atomic operations (like "unsafe-fp-atomics") to metadata on individual atomic operations. This commit adds support for generating this metadata from MLIR. --------- Co-authored-by: Quinn Dawkins --- .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 1 + mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 9 +++++-- .../ROCDL/ROCDLToLLVMIRTranslation.cpp | 27 ++++++++++++++++++- mlir/test/Target/LLVMIR/rocdl.mlir | 23 ++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 80c22a357287ba..c298c8277eb0c3 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -1071,6 +1071,7 @@ def LLVM_ConstantRangeAttr : LLVM_Attr<"ConstantRange", "constant_range"> { Syntax: ``` `<` `i`(width($lower)) $lower `,` $upper `>` + ``` }]; let builders = [ diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index aae2cf88ded041..b80d9ae88910c4 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -58,7 +58,12 @@ def ROCDL_Dialect : Dialect { "::mlir::StringAttr":$flat_work_group_size, "::mlir::IntegerAttr":$max_flat_work_group_size, "::mlir::IntegerAttr":$waves_per_eu, - "::mlir::BoolAttr":$unsafe_fp_atomics + "::mlir::BoolAttr":$unsafe_fp_atomics, + // Correspond to LLVM metadata of the same name + "::mlir::UnitAttr":$last_use, + "::mlir::UnitAttr":$no_remote_memory, + "::mlir::UnitAttr":$no_fine_grained_memory, + "::mlir::UnitAttr":$ignore_denormal_mode ); let useDefaultAttributePrinterParser = 1; @@ -88,7 +93,7 @@ class ROCDL_IntrPure1Op : class ROCDL_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, - int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list immArgPositions = [], + int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase(attribute.getNameDialect()); + llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); if (dialect->getKernelAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast(op); if (!func) @@ -198,7 +199,6 @@ class ROCDLDialectLLVMIRTranslationInterface if (!value) return op->emitOpError(Twine(attribute.getName()) + " must be a dense i32 array attribute"); - llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); SmallVector metadata; llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32); for (int32_t i : value.asArrayRef()) { @@ -210,6 +210,31 @@ class ROCDLDialectLLVMIRTranslationInterface llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata); llvmFunc->setMetadata("reqd_work_group_size", node); } + + // Atomic and nontemporal metadata + if (dialect->getLastUseAttrHelper().getName() == attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.last.use", llvm::MDNode::get(llvmContext, {})); + } + if (dialect->getNoRemoteMemoryAttrHelper().getName() == + attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.no.remote.memory", + llvm::MDNode::get(llvmContext, {})); + } + if (dialect->getNoFineGrainedMemoryAttrHelper().getName() == + attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.no.fine.grained.memory", + llvm::MDNode::get(llvmContext, {})); + } + if (dialect->getIgnoreDenormalModeAttrHelper().getName() == + attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.ignore.denormal.mode", + llvm::MDNode::get(llvmContext, {})); + } + return success(); } }; diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 08c2d4e6477970..97276b087b7e93 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -564,11 +564,34 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { } llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> { + // CHECK-LABEL: @rocdl_16bit_packed_floats // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}}) %source = rocdl.cvt.pkrtz %sourceA, %sourceB : vector<2xf16> llvm.return %source : vector<2xf16> } +llvm.func @rocdl_atomic_attrs(%ptr: !llvm.ptr<1>, %data: f32) { + // CHECK-LABEL: @rocdl_atomic_attrs + // CHECK: atomicrmw + // CHECK-SAME: !amdgpu.ignore.denormal.mode + // CHECK-SAME: !amdgpu.no.fine.grained.memory + // CHECK-SAME: !amdgpu.no.remote.memory + llvm.atomicrmw fadd %ptr, %data monotonic { + rocdl.ignore_denormal_mode, + rocdl.no_fine_grained_memory, + rocdl.no_remote_memory} : !llvm.ptr<1>, f32 + llvm.return +} + +llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 { + // CHECK-LABEL: @rocdl_last_use + // CHECK: %[[ret:.+]] = load + // CHECK-SAME: !amdgpu.last.use + // CHECK: ret i32 %[[ret]] + %ret = llvm.load %ptr {rocdl.last_use} : !llvm.ptr<1> -> i32 + llvm.return %ret : i32 +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"