From cabb5d54cb7075ddd3aced60215c4f18b8912f4e Mon Sep 17 00:00:00 2001 From: Vikram Date: Thu, 27 Jun 2024 07:31:01 -0400 Subject: [PATCH] [AMDGPU] Enable atomic optimizer for 64 bit divergent values --- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 22 +- .../GlobalISel/global-atomic-fadd.f64.ll | 1346 +++++- .../atomic_optimizations_global_pointer.ll | 1038 ++++- .../atomic_optimizations_local_pointer.ll | 638 ++- .../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 1332 +++++- .../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 504 +- .../global_atomics_optimizer_fp_no_rtn.ll | 432 +- .../AMDGPU/global_atomics_scan_fadd.ll | 4054 +++++++++++----- .../AMDGPU/global_atomics_scan_fmax.ll | 2473 +++++++--- .../AMDGPU/global_atomics_scan_fmin.ll | 2473 +++++++--- .../AMDGPU/global_atomics_scan_fsub.ll | 4056 ++++++++++++----- 11 files changed, 14422 insertions(+), 3946 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index cdd1953dca4ec0..feffc3adb21b21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) { return Changed; } +static bool shouldOptimize(Type *Ty) { + switch (Ty->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + return true; + case Type::IntegerTyID: { + if (Ty->getIntegerBitWidth() == 32 || Ty->getIntegerBitWidth() == 64) + return true; + default: + return false; + } + } +} + void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // Early exit for unhandled address space atomic instructions. switch (I.getPointerAddressSpace()) { @@ -230,8 +244,7 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && - (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { + if (ValDivergent && (!ST->hasDPP() || !shouldOptimize(I.getType()))) { return; } @@ -313,8 +326,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && - (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { + if (ValDivergent && (!ST->hasDPP() || !shouldOptimize(I.getType()))) { return; } @@ -745,7 +757,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // of each active lane in the wavefront. This will be our new value // which we will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); + assert(TyBitWidth == 32 || TyBitWidth == 64); NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx}); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index b058ad1023e130..8ad91f001bd72b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -1,249 +1,1219 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.1 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %25, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %35, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.Flow1: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %22, %bb.7, [[COPY4]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_DPP: bb.1 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3 (%ir-block.31): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.33): + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_ENDPGM 0 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.1 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %24, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF %34, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.Flow1: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %16, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %21, %bb.7, [[COPY4]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_DPP: bb.1 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3 (%ir-block.31): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4.Flow: + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.33): + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.1 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %28, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %38, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %27, 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.13): + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %43.sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %43.sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.4, [[DEF]], %bb.1 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %19, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[DEF]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %24, %bb.7, [[COPY4]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY10]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY11]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY14]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY15]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY16]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY17]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY18]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY19]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY20]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY21]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE4]], [[COPY22]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE3]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE5]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY23]] + ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY27]], [[COPY28]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY26]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY29]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY30]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_DPP: bb.1 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3 (%ir-block.32): + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: S_BRANCH %bb.5 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %44, %bb.5, [[DEF]], %bb.1 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.6 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.35): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.4 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.6 (%ir-block.40): + ; GFX90A_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.1 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %27, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8 + ; GFX940_ITERATIVE-NEXT: SI_END_CF %37, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %26, 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.13): + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %42.sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %42.sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.4, [[DEF]], %bb.1 + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[DEF]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %23, %bb.7, [[COPY4]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY10]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY11]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY14]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY15]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY16]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY17]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY18]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY19]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY20]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY21]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE4]], [[COPY22]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE3]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE5]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY23]] + ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY27]], [[COPY28]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY26]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY29]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY30]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_DPP: bb.1 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3 (%ir-block.32): + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: S_BRANCH %bb.5 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4.Flow: + ; GFX940_DPP-NEXT: successors: %bb.6(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %43, %bb.5, [[DEF]], %bb.1 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.6 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.35): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.4 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.6 (%ir-block.40): + ; GFX940_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX940_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX940_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index d3944d3d52d776..f0cec54691d5d8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2,12 +2,12 @@ ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1264 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1232 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -1744,87 +1744,440 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX89-LABEL: add_i64_varying: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: add_i64_varying: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: v_writelane_b32 v2, s5, m0 +; GFX8-NEXT: s_addc_u32 s5, s5, s7 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8-NEXT: s_cbranch_execz .LBB5_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: .LBB5_4: +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i64_varying: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: v_writelane_b32 v2, s5, m0 +; GFX9-NEXT: s_addc_u32 s5, s5, s7 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB5_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: add_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s11, s7 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: add_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: add_i64_varying: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX1064-LABEL: add_i64_varying: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3] +; GFX1064-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1064-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1064-NEXT: s_add_u32 s4, s4, s7 +; GFX1064-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: .LBB5_4: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: add_i64_varying: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1032-NEXT: s_add_u32 s4, s4, s6 +; GFX1032-NEXT: s_addc_u32 s5, s5, s7 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 +; GFX1032-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: .LBB5_4: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1164-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1164-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1164-NEXT: s_add_u32 s4, s4, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_addc_u32 s5, s5, s8 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: .LBB5_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1132-NEXT: s_add_u32 s4, s4, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: .LBB5_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX1264-LABEL: add_i64_varying: +; GFX1264: ; %bb.0: ; %entry +; GFX1264-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1264-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1264-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264-NEXT: v_writelane_b32 v2, s5, s10 +; GFX1264-NEXT: v_writelane_b32 v1, s4, s10 +; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1264-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264-NEXT: s_mov_b64 s[6:7], exec +; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264-NEXT: s_cbranch_execz .LBB5_4 +; GFX1264-NEXT: ; %bb.3: +; GFX1264-NEXT: v_mov_b32_e32 v3, s4 +; GFX1264-NEXT: v_mov_b32_e32 v4, s5 +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: .LBB5_4: +; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1264-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1264-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1264-NEXT: s_nop 0 +; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264-NEXT: s_endpgm +; +; GFX1232-LABEL: add_i64_varying: +; GFX1232: ; %bb.0: ; %entry +; GFX1232-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1232-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1232-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1232-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1232-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1232-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1232-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232-NEXT: s_cbranch_execz .LBB5_4 +; GFX1232-NEXT: ; %bb.3: +; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: s_mov_b32 s8, s2 +; GFX1232-NEXT: s_mov_b32 s9, s3 +; GFX1232-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: global_inv scope:SCOPE_DEV +; GFX1232-NEXT: .LBB5_4: +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1232-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1232-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1232-NEXT: s_nop 0 +; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3689,87 +4042,440 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX89-LABEL: sub_i64_varying: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: sub_i64_varying: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: .LBB11_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: v_writelane_b32 v2, s5, m0 +; GFX8-NEXT: s_addc_u32 s5, s5, s7 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8-NEXT: s_cbranch_execz .LBB11_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: .LBB11_4: +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i64_varying: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: v_writelane_b32 v2, s5, m0 +; GFX9-NEXT: s_addc_u32 s5, s5, s7 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB11_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: sub_i64_varying: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3] +; GFX1064-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1064-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1064-NEXT: s_add_u32 s4, s4, s7 +; GFX1064-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: .LBB11_4: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: sub_i64_varying: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1032-NEXT: s_add_u32 s4, s4, s6 +; GFX1032-NEXT: s_addc_u32 s5, s5, s7 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v3, s4 +; GFX1032-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: .LBB11_4: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1164-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1164-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1164-NEXT: s_add_u32 s4, s4, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_addc_u32 s5, s5, s8 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: .LBB11_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm ; -; GFX10-LABEL: sub_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s11, s7 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: sub_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: sub_i64_varying: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX1132-LABEL: sub_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1132-NEXT: s_add_u32 s4, s4, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: .LBB11_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX1264-LABEL: sub_i64_varying: +; GFX1264: ; %bb.0: ; %entry +; GFX1264-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1264-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1264-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264-NEXT: v_writelane_b32 v2, s5, s10 +; GFX1264-NEXT: v_writelane_b32 v1, s4, s10 +; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1264-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264-NEXT: s_mov_b64 s[6:7], exec +; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264-NEXT: s_cbranch_execz .LBB11_4 +; GFX1264-NEXT: ; %bb.3: +; GFX1264-NEXT: v_mov_b32_e32 v3, s4 +; GFX1264-NEXT: v_mov_b32_e32 v4, s5 +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: .LBB11_4: +; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1264-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1264-NEXT: s_nop 0 +; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264-NEXT: s_endpgm +; +; GFX1232-LABEL: sub_i64_varying: +; GFX1232: ; %bb.0: ; %entry +; GFX1232-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1232-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1232-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1232-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1232-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1232-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1232-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232-NEXT: s_cbranch_execz .LBB11_4 +; GFX1232-NEXT: ; %bb.3: +; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: s_mov_b32 s8, s2 +; GFX1232-NEXT: s_mov_b32 s9, s3 +; GFX1232-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: global_inv scope:SCOPE_DEV +; GFX1232-NEXT: .LBB11_4: +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1232-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1232-NEXT: s_nop 0 +; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index b0b40aa952a9fb..453bd07647c73d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -1445,56 +1445,301 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: add_i64_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: .LBB6_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8-NEXT: s_mov_b32 m0, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_u32 s2, s2, s8 +; GFX8-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8-NEXT: s_addc_u32 s3, s3, s7 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB6_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: .LBB6_4: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i64_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9-NEXT: s_mov_b32 m0, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB6_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: add_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: add_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX1064-LABEL: add_i64_varying: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1064-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1064-NEXT: s_add_u32 s2, s2, s7 +; GFX1064-NEXT: s_addc_u32 s3, s3, s8 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB6_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: .LBB6_4: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: add_i64_varying: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_mov_b64 s[2:3], 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032-NEXT: s_add_u32 s2, s2, s6 +; GFX1032-NEXT: s_addc_u32 s3, s3, s7 +; GFX1032-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032-NEXT: s_cbranch_execz .LBB6_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, s2 +; GFX1032-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: .LBB6_4: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1164-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1164-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1164-NEXT: s_add_u32 s2, s2, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_addc_u32 s3, s3, s8 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: .LBB6_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1132-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132-NEXT: s_add_u32 s2, s2, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_addc_u32 s3, s3, s7 +; GFX1132-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: .LBB6_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -2972,56 +3217,301 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: sub_i64_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: .LBB13_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8-NEXT: s_mov_b32 m0, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_u32 s2, s2, s8 +; GFX8-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8-NEXT: s_addc_u32 s3, s3, s7 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB13_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: .LBB13_4: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i64_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: .LBB13_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9-NEXT: s_mov_b32 m0, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: sub_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: sub_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX1064-LABEL: sub_i64_varying: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: .LBB13_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1064-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1064-NEXT: s_add_u32 s2, s2, s7 +; GFX1064-NEXT: s_addc_u32 s3, s3, s8 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB13_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: .LBB13_4: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: sub_i64_varying: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_mov_b64 s[2:3], 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: .LBB13_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032-NEXT: s_add_u32 s2, s2, s6 +; GFX1032-NEXT: s_addc_u32 s3, s3, s7 +; GFX1032-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032-NEXT: s_cbranch_execz .LBB13_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, s2 +; GFX1032-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: .LBB13_4: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1164-NEXT: .LBB13_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1164-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1164-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1164-NEXT: s_add_u32 s2, s2, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_addc_u32 s3, s3, s8 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB13_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: .LBB13_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1132-NEXT: .LBB13_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1132-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132-NEXT: s_add_u32 s2, s2, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_addc_u32 s3, s3, s7 +; GFX1132-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132-NEXT: s_cbranch_execz .LBB13_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: .LBB13_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 9d8b987d2ba68c..60149b90cb0486 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -1,255 +1,1199 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.0 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]] + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %42 + ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5.Flow1: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %26, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]] + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_DPP: bb.0 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.31): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1 + ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4 (%ir-block.33): + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_ENDPGM 0 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.0 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]] + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %41 + ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5.Flow1: + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %25, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]] + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_DPP: bb.0 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.31): + ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1 + ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3.Flow: + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4 (%ir-block.33): + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.0 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]] + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %68 + ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %77, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.13): + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %5.sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY13]] + ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY14]] + ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[V_ADD_F64_e64_]], %bb.3 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %42, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]] + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY15]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY16]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY18]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY17]] + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY20]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY19]] + ; GFX90A_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE3]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY22]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY21]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY24]], [[COPY25]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY23]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[DEF7]] + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_DPP: bb.0 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.32): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY %2 + ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY14]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: S_BRANCH %bb.4 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %7, %bb.4 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.5 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4 (%ir-block.35): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY13]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %55:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %55, 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.40): + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[COPY17]] + ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[COPY18]] + ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.0 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]] + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %67 + ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %76, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX940_ITERATIVE-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.13): + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %5.sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY13]] + ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY14]] + ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[V_ADD_F64_e64_]], %bb.3 + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %41, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]] + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY15]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY16]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY18]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY17]] + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY20]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY19]] + ; GFX940_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE3]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY22]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY21]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY24]], [[COPY25]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY23]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[DEF7]] + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_DPP: bb.0 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.32): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY %2 + ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY14]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: S_BRANCH %bb.4 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3.Flow: + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %7, %bb.4 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.5 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4 (%ir-block.35): + ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY13]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %54:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %54, 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.40): + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_DPP-NEXT: $sgpr0 = COPY [[COPY17]] + ; GFX940_DPP-NEXT: $sgpr1 = COPY [[COPY18]] + ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index 2397d6c4e8938b..166865b9b866fe 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -990,9 +990,87 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd double [[TMP14]], [[TMP21:%.*]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP17]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) +; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = fadd double [[ACCUMULATOR]], [[TMP20]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd double [[TMP30]], [[TMP31]] +; IR-DPP-NEXT: br label [[TMP33]] +; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP34]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret double %result @@ -1064,9 +1142,87 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP17]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP33]] +; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP34]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -1138,9 +1294,87 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP17]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP33]] +; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP34]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1178,9 +1412,87 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP21:%.*]]) +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP17]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) +; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP20]]) +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF0000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.minnum.f64(double [[TMP30]], double [[TMP31]]) +; IR-DPP-NEXT: br label [[TMP33]] +; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP34]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1244,9 +1556,87 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ } define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{ -; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP17]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0xFFF0000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP30]], double [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP33]] +; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP34]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1318,9 +1708,87 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP17]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP33]] +; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP34]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index d6edba001fb13c..8da8a9e9d3c61c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -852,9 +852,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = fadd double [[ACCUMULATOR]], [[TMP16]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret void @@ -914,9 +980,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret void @@ -976,9 +1108,75 @@ define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1010,9 +1208,75 @@ define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP16]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF0000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1064,9 +1328,75 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{ -; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0xFFF0000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1126,9 +1456,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_sc } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 04df04a5c299b3..b6990c8b842fdc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -5873,10 +5873,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -5891,24 +5891,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -5918,43 +5941,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -5964,115 +6010,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6083,10 +6204,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -6101,24 +6222,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6128,43 +6308,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6174,115 +6404,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 @@ -6820,10 +7206,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -6838,24 +7224,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -6865,43 +7274,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -6911,115 +7343,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -7030,10 +7537,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7048,24 +7555,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -7075,161 +7641,367 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: -; GFX1032-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -7767,10 +8539,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7785,24 +8557,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -7812,43 +8607,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -7858,129 +8676,204 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: .LBB14_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: .LBB14_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7995,24 +8888,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8022,43 +8974,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8068,115 +9070,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -8245,10 +9403,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -8263,24 +9421,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8290,43 +9471,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8336,115 +9540,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: .LBB15_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8455,10 +9734,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -8473,24 +9752,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8500,43 +9838,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8546,115 +9934,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9192,10 +10736,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9210,26 +10754,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_endpgm -; +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_5: +; GFX9-NEXT: s_endpgm +; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9237,43 +10804,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -9283,115 +10873,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -9402,10 +11067,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9420,24 +11085,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -9447,43 +11171,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -9493,115 +11267,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 9f27314cc39091..f512f17bbbcbfe 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -3695,10 +3695,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -3713,26 +3713,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -3742,29 +3767,55 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB7_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1] +; GFX1064-NEXT: .LBB7_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -3774,107 +3825,191 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB7_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1] +; GFX1032-NEXT: .LBB7_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -3885,10 +4020,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -3903,26 +4038,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -3932,29 +4133,88 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB7_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -3964,115 +4224,296 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB7_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_endpgm - %divValue = call double @div.double.value() - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 - ret void -} - -define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { -; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 @@ -4490,10 +4931,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -4508,26 +4949,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -4537,45 +5003,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1064-NEXT: .LBB9_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -4585,123 +5076,206 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1032-NEXT: .LBB9_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -4712,10 +5286,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -4730,26 +5304,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -4759,45 +5399,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -4807,123 +5505,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -5349,10 +6227,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5367,26 +6245,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -5396,45 +6299,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -5444,123 +6372,206 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -5571,10 +6582,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5589,26 +6600,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -5618,45 +6695,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -5666,123 +6801,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index f16f61159fc303..c3b3079db3adc7 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -3695,10 +3695,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -3713,26 +3713,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -3742,29 +3767,55 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB7_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1] +; GFX1064-NEXT: .LBB7_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -3774,107 +3825,191 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB7_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1] +; GFX1032-NEXT: .LBB7_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -3885,10 +4020,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -3903,26 +4038,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -3932,29 +4133,88 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB7_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -3964,115 +4224,296 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB7_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_endpgm - %divValue = call double @div.double.value() - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 - ret void -} - -define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { -; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 @@ -4490,10 +4931,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -4508,26 +4949,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -4537,45 +5003,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1064-NEXT: .LBB9_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -4585,123 +5076,206 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1032-NEXT: .LBB9_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -4712,10 +5286,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -4730,26 +5304,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -4759,45 +5399,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -4807,123 +5505,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -5349,10 +6227,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5367,26 +6245,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -5396,45 +6299,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -5444,123 +6372,206 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -5571,10 +6582,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5589,26 +6600,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -5618,45 +6695,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -5666,123 +6801,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 64650e2733a00d..8664fdf242036c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -6081,10 +6081,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -6099,24 +6099,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6126,43 +6149,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6172,115 +6218,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6291,10 +6412,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -6309,24 +6430,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6336,43 +6516,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6382,115 +6612,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 @@ -7027,10 +7413,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7045,24 +7431,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -7072,43 +7481,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -7118,115 +7550,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -7237,10 +7744,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7255,24 +7762,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: @@ -7282,161 +7848,367 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -7974,10 +8746,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7992,24 +8764,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8019,43 +8814,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8065,129 +8883,204 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: .LBB14_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: .LBB14_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8202,24 +9095,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8229,43 +9181,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8275,115 +9277,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -8452,10 +9610,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -8470,24 +9628,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8497,43 +9678,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8543,115 +9747,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: .LBB15_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8662,10 +9941,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -8680,24 +9959,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8707,43 +10045,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: @@ -8753,115 +10141,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9398,10 +10942,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9416,24 +10960,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -9443,43 +11010,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -9489,115 +11079,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -9608,10 +11273,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9626,24 +11291,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -9653,43 +11377,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -9699,115 +11473,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 8