[AMDGPU] Cleanup bitcast spam in atomic optimizer #96933
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Vikram Hegde (vikramRH)

Changes

Patch is 339.50 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/96933.diff

14 Files Affected:
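In short: the pass previously requested the readlane/writelane/readfirstlane/permlanex16/set_inactive intrinsics at an i32 or iN type and bitcast floating-point values around each call; the patch requests them at the atomic value's own type instead, so the casts disappear. Below is a minimal sketch of the before/after IRBuilder pattern, for illustration only (names such as B, V and AtomicTy mirror the pass; this is not the exact patch code):

// Illustrative sketch only; naming mirrors AMDGPUAtomicOptimizer.cpp.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Old pattern: cast to iN, call the integer-typed readlane, cast back.
static Value *readLane0Before(IRBuilder<> &B, Value *V, Type *AtomicTy) {
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
  Value *AsInt = B.CreateBitCast(V, IntNTy);
  Value *Lane0 = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
                                   {AsInt, B.getInt32(0)});
  return B.CreateBitCast(Lane0, AtomicTy);
}

// New pattern: the intrinsic is mangled on AtomicTy itself, so no casts.
static Value *readLane0After(IRBuilder<> &B, Value *V, Type *AtomicTy) {
  return B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_readlane,
                           {V, B.getInt32(0)});
}

The same substitution is applied to every helper in the diff below.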
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index d7ef6f3c5dc43..cdd1953dca4ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -386,7 +386,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
Value *V,
Value *const Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -402,34 +401,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce within each pair of rows (i.e. 32 lanes).
assert(ST->hasPermLaneX16());
- V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- B.CreateBitCast(Permlanex16Call, AtomicTy));
+ V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
if (ST->isWave32()) {
return V;
}
if (ST->hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
- V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
- return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- B.CreateBitCast(Permlane64Call, AtomicTy));
+ return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
}
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
- V = B.CreateBitCast(V, IntNTy);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
- return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
- B.CreateBitCast(Lane32, AtomicTy));
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
// Use the builder to create an inclusive scan of V across the wavefront, with
@@ -438,8 +431,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
AtomicRMWInst::BinOp Op, Value *V,
Value *Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -470,20 +461,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
assert(ST->hasPermLaneX16());
- V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
- Value *UpdateDPPCall =
- B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
- B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
- B.getInt32(0xf), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+ Value *UpdateDPPCall = B.CreateCall(
+ UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+ V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
- V = B.CreateBitCast(V, IntNTy);
Value *const Lane31 = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -491,8 +479,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- UpdateDPPCall);
+ V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
}
}
return V;
@@ -503,8 +490,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Value *Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -514,10 +499,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane = Intrinsic::getDeclaration(
- M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
- Function *WriteLane = Intrinsic::getDeclaration(
- M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
@@ -527,24 +512,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
// Copy the old lane 15 to the new lane 16.
- V = B.CreateCall(
- WriteLane,
- {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
- B.getInt32(16), B.CreateBitCast(V, IntNTy)});
- V = B.CreateBitCast(V, AtomicTy);
+ V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+ B.getInt32(16), V});
+
if (!ST->isWave32()) {
// Copy the old lane 31 to the new lane 32.
- V = B.CreateBitCast(V, IntNTy);
- V = B.CreateCall(WriteLane,
- {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
- B.getInt32(31)}),
- B.getInt32(32), V});
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
// Copy the old lane 47 to the new lane 48.
V = B.CreateCall(
WriteLane,
{B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
- V = B.CreateBitCast(V, AtomicTy);
}
}
@@ -584,24 +564,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
auto *FF1 =
B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
- Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
- auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+ auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
// Get the value required for atomic operation
- V = B.CreateBitCast(V, IntNTy);
Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
{V, LaneIdxInt});
- LaneValue = B.CreateBitCast(LaneValue, Ty);
// Perform writelane if intermediate scan results are required later in the
// kernel computations
Value *OldValue = nullptr;
if (NeedResult) {
- OldValue =
- B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
- {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
- B.CreateBitCast(OldValuePhi, IntNTy)});
- OldValue = B.CreateBitCast(OldValue, Ty);
+ OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+ {Accumulator, LaneIdxInt, OldValuePhi});
OldValuePhi->addIncoming(OldValue, ComputeLoop);
}
@@ -700,10 +674,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
- Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
- auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -758,13 +730,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
if (ScanImpl == ScanOptions::DPP) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- V = B.CreateBitCast(V, IntNTy);
- Identity = B.CreateBitCast(Identity, IntNTy);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
- {V, Identity});
- NewV = B.CreateBitCast(NewV, Ty);
- V = B.CreateBitCast(V, Ty);
- Identity = B.CreateBitCast(Identity, Ty);
+ NewV =
+ B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
if (!NeedResult && ST->hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
@@ -779,10 +746,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// which we will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
assert(TyBitWidth == 32);
- NewV = B.CreateBitCast(NewV, IntNTy);
- NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+ NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
- NewV = B.CreateBitCast(NewV, Ty);
}
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -922,26 +887,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// but have to handle 64-bit broadcasts with two calls to this intrinsic.
Value *BroadcastI = nullptr;
- if (TyBitWidth == 64) {
- Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
- Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
- CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
- Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
- CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
- Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
- Value *const PartialInsert = B.CreateInsertElement(
- PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
- BroadcastI = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
- Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
- BroadcastI =
- B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
- BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
+ if (TyBitWidth == 32 || TyBitWidth == 64) {
+ BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
} else {
llvm_unreachable("Unhandled atomic bit width");
}
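The largest single simplification is the final hunk above: a 64-bit broadcast no longer has to be split into two 32-bit readfirstlane calls and reassembled through a <2 x i32> vector. A rough paraphrase of the removed and added shapes follows (names such as PHI, Ty and Int32Ty follow the pass; this is a sketch of the 64-bit branch, not a drop-in replacement):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Removed shape: split the 64-bit value, broadcast each half, rebuild it.
static Value *broadcast64Old(IRBuilder<> &B, Value *PHI, Type *Ty) {
  Type *Int32Ty = B.getInt32Ty();
  auto *VecTy = FixedVectorType::get(Int32Ty, 2);
  Value *CastedPhi = B.CreateBitCast(PHI, B.getInt64Ty());
  Value *Lo = B.CreateTrunc(CastedPhi, Int32Ty);
  Value *Hi = B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
  Value *LoRFL = B.CreateIntrinsic(Int32Ty, Intrinsic::amdgcn_readfirstlane, Lo);
  Value *HiRFL = B.CreateIntrinsic(Int32Ty, Intrinsic::amdgcn_readfirstlane, Hi);
  Value *Vec = B.CreateInsertElement(PoisonValue::get(VecTy), LoRFL, B.getInt32(0));
  Vec = B.CreateInsertElement(Vec, HiRFL, B.getInt32(1));
  return B.CreateBitCast(Vec, Ty);
}

// New shape: one readfirstlane mangled on Ty covers 32- and 64-bit atomics.
static Value *broadcastNew(IRBuilder<> &B, Value *PHI, Type *Ty) {
  return B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
}

The test updates below largely reflect the renumbered IR blocks and the constants and registers that are now reused.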
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 21832dc320e42..3f0b86c271538 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -169,30 +169,29 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]]
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
@@ -200,7 +199,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+ ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31):
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -211,7 +210,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+ ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33):
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index e48d281f37c9a..676eae1bad85d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -171,35 +171,34 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEX...
[truncated]
LGTM.
Code looks much cleaner now.
5bc37d0 to 5423d96 (Compare)
Change-Id: I48b78d265985d38dbe270791c8c394a6a60199b3
No description provided.