From a01e900baf78f625d808e14784f414359f649d57 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 11 Oct 2024 08:50:49 +0200 Subject: [PATCH] [AMDGPU] Enable unaligned scratch accesses (#110219) This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics. Also: Make the use of the FeatureUnalignedScratchAccess feature more consistent: FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas, before, code assumed that the latter implies the former at some places. Part of SWDEV-455845. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 24 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 1037 ++- .../AMDGPU/GlobalISel/legalize-load-flat.mir | 3222 +------- .../GlobalISel/legalize-load-private.mir | 5246 +++++++------ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 28 +- .../test/CodeGen/AMDGPU/flat-address-space.ll | 12 +- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 98 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 2438 +----- .../AMDGPU/memcpy-param-combinations.ll | 6516 +++-------------- .../AMDGPU/memmove-param-combinations.ll | 5196 ++----------- llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 4 +- .../CodeGen/AMDGPU/unaligned-load-store.ll | 28 +- 15 files changed, 6082 insertions(+), 17791 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25117544d6a849..62fac085897ab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, - FeatureVmemWriteVgprInOrder + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder ] >; @@ -1199,9 +1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureVmemWriteVgprInOrder @@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureVmemWriteVgprInOrder ] @@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics ] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3f4f42377d56ee..d701bf037fdfa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && - ChainSizeInBytes <= ST->getMaxPrivateElementSize(); + return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 19458126093167..1ea3beb2855d69 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -591,6 +591,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return UnalignedScratchAccess; } + bool hasUnalignedScratchAccessEnabled() const { + return UnalignedScratchAccess && UnalignedAccessMode; + } + bool hasUnalignedAccessMode() const { return UnalignedAccessMode; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3d8e03521e2b90..8c197f23149612 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1824,26 +1824,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Subtarget->hasUnalignedDSAccessEnabled(); } - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; - - return AlignedBy4 || - Subtarget->enableFlatScratch() || - Subtarget->hasUnalignedScratchAccess(); - } - // FIXME: We have to be conservative here and assume that flat operations // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. - if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && - !Subtarget->hasUnalignedScratchAccess()) { + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4; + return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); } // So long as they are correct, wide global memory operations perform better diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index ce528467cd35b4..6e2e88f22600a8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -2428,11 +2428,54 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v4, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v3, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v3, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v8, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,30 +2484,143 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v3, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v3, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v4, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v8, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v3, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v3, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,12 +2631,39 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2572,59 +2755,293 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v12, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v10, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v10, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, s2 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, s1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, s0 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 2, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_add_nc_u32 v5, 1, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v10, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2635,16 +3052,57 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2742,64 +3200,382 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v13, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v14, v6, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v13, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v14, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v16, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v15, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v0, v6, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v11, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v10, v11, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v8, 4 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v14, v8, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v17, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v10, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v14, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v15, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v16, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v17, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v13, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v14, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v16, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v15, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_add_nc_u32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v8, 4 :: v_dual_add_nc_u32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v3, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v10, v11, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v14, v8, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v15, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v16, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v17, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v10, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v14, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v15, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v16, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v17, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2810,17 +3586,74 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 4 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index b1d7d36f9912e7..032ca7c0d4fee9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -483,40 +483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -664,40 +646,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -798,70 +762,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1247,76 +1163,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1485,130 +1347,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -2075,87 +1829,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2 @@ -2369,165 +2060,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1 @@ -3334,210 +2884,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX12-LABEL: name: test_load_flat_s128_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1 @@ -4132,133 +3496,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX12-LABEL: name: test_load_flat_p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4662,79 +3915,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4906,133 +4102,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5274,43 +4359,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5416,73 +4480,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5732,40 +4745,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v2s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6158,121 +5153,106 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX12-LABEL: name: test_load_flat_v3s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6503,40 +5483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6638,70 +5600,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7185,40 +6099,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7327,70 +6223,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -8291,36 +7139,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8334,36 +7168,22 @@ body: | ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8377,36 +7197,22 @@ body: | ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8765,70 +7571,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -9005,124 +7763,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -10686,133 +9342,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -11100,235 +9645,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -12078,342 +10410,42 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v3s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1 @@ -13306,441 +11338,33 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX9PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v4s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1 @@ -14762,210 +12386,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX12-LABEL: name: test_load_flat_v2p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1 @@ -15422,124 +12860,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 741f878c86f8b6..6d93112aae1a06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -636,27 +636,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s16_align1 ; GFX11: liveins: $vgpr0 @@ -702,15 +690,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -853,27 +853,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align2 ; GFX11: liveins: $vgpr0 @@ -919,15 +907,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -1012,47 +1012,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align1 ; GFX11: liveins: $vgpr0 @@ -1118,15 +1086,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR2]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -1529,39 +1529,27 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX10-LABEL: name: test_load_private_s24_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX11-LABEL: name: test_load_private_s24_align1 ; GFX11: liveins: $vgpr0 @@ -1631,27 +1619,39 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR1]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s24_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load (s24), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -2147,42 +2147,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align2 @@ -2245,15 +2225,51 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2386,78 +2402,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align1 @@ -2556,15 +2516,87 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2742,53 +2774,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2796,53 +2789,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2974,16 +2928,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -3381,28 +3427,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3410,28 +3442,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3513,16 +3531,58 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 5) @@ -3701,53 +3761,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3755,53 +3776,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3933,16 +3915,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -4166,68 +4240,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4235,68 +4258,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4458,16 +4430,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -4928,35 +5022,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4964,35 +5040,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5088,16 +5146,72 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 2, addrspace 5) @@ -5321,68 +5435,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5390,68 +5453,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5613,16 +5625,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -5932,42 +6066,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align2 @@ -6030,15 +6144,53 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6171,78 +6323,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align1 @@ -6341,15 +6437,89 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6494,29 +6664,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align2 ; GFX11: liveins: $vgpr0 @@ -6564,15 +6720,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -6660,49 +6830,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align1 ; GFX11: liveins: $vgpr0 @@ -6770,15 +6906,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -6923,29 +7093,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align2 ; GFX11: liveins: $vgpr0 @@ -6993,15 +7149,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -7089,49 +7259,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align1 ; GFX11: liveins: $vgpr0 @@ -7199,15 +7335,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -7357,30 +7527,20 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s8_align1 @@ -7437,20 +7597,30 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 5) @@ -7938,81 +8108,71 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX10-LABEL: name: test_load_private_v3s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX10-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11-LABEL: name: test_load_private_v3s8_align1 ; GFX11: liveins: $vgpr0 @@ -8168,71 +8328,81 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR4]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 5) %2:_(s24) = G_BITCAST %1 @@ -8658,136 +8828,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v16s8_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v16s8_align16 @@ -8944,15 +9012,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v16s8_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 1, addrspace 5) %2:_(<4 x s32>) = G_BITCAST %1 @@ -9107,27 +9297,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align2 ; GFX11: liveins: $vgpr0 @@ -9173,15 +9351,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -9278,47 +9468,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align1 ; GFX11: liveins: $vgpr0 @@ -9384,15 +9542,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -9824,27 +10014,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -9853,27 +10042,26 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10215,41 +10403,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10258,41 +10431,26 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10445,22 +10603,36 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10474,22 +10646,36 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10827,44 +11013,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align2 @@ -10929,15 +11093,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11091,80 +11287,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align1 @@ -11265,15 +11403,83 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11582,42 +11788,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align2 @@ -11680,15 +11866,43 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11821,78 +12035,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align1 @@ -11991,15 +12149,79 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -12174,106 +12396,28 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX10-LABEL: name: test_load_private_v3s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX11-LABEL: name: test_load_private_v3s32_align16 @@ -12400,15 +12544,107 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -12764,136 +13000,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align16 @@ -13050,15 +13184,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13493,70 +13749,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align2 @@ -13647,15 +13867,71 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 2, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13875,136 +14151,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align1 @@ -14161,15 +14335,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -15262,68 +15558,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15331,68 +15576,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15552,15 +15746,155 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s64_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -18178,98 +18512,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18280,98 +18539,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX10-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX10-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX10-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX10-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX10-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX10-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX10-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX10-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX10-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX10-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX10-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX10-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX10-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18616,12 +18800,99 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX11-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX11-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18631,12 +18902,99 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18818,49 +19176,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18871,49 +19203,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -19060,12 +19366,50 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -19075,12 +19419,50 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab7f..3fc5d0d4b279eb 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 +; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm @@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: s_clause 0x1 -; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR_GFX10-NEXT: s_endpgm @@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v0, off, off -; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v3, off, off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 0ad53083d0ff3f..12593e3760fd3e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr } ; GCN-LABEL: flat_scratch_unaligned_load: -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} +; GFX9: flat_load_dword +; GFX10PLUS: flat_load_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr @@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { } ; GCN-LABEL: flat_scratch_unaligned_store: -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} +; GFX9: flat_store_dword +; GFX10PLUS: flat_store_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe5e..9d43efbdf07b1f 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v9, s7 ; CHECK-NEXT: v_mov_b32_e32 v8, s6 -; CHECK-NEXT: flat_load_ubyte v10, v[8:9] offset:5 -; CHECK-NEXT: flat_load_ubyte v11, v[8:9] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[8:9] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[8:9] offset:3 -; CHECK-NEXT: flat_load_ubyte v14, v[8:9] offset:2 -; CHECK-NEXT: flat_load_ubyte v15, v[8:9] offset:1 -; CHECK-NEXT: flat_load_ubyte v16, v[8:9] -; CHECK-NEXT: flat_load_ubyte v17, v[8:9] offset:4 -; CHECK-NEXT: flat_load_ubyte v18, v[8:9] offset:13 -; CHECK-NEXT: flat_load_ubyte v19, v[8:9] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[8:9] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[8:9] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[8:9] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[8:9] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[8:9] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[8:9] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, s7 +; CHECK-NEXT: v_add_co_u32_e32 v12, vcc, s6, v6 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[4:5], 2 -; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v7, vcc ; CHECK-NEXT: s_add_u32 s6, s6, 16 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[8:9], v13 offset:3 -; CHECK-NEXT: flat_store_byte v[8:9], v14 offset:2 -; CHECK-NEXT: flat_store_byte v[8:9], v15 offset:1 -; CHECK-NEXT: flat_store_byte v[8:9], v16 -; CHECK-NEXT: flat_store_byte v[8:9], v12 offset:7 -; CHECK-NEXT: flat_store_byte v[8:9], v11 offset:6 -; CHECK-NEXT: flat_store_byte v[8:9], v10 offset:5 -; CHECK-NEXT: flat_store_byte v[8:9], v17 offset:4 -; CHECK-NEXT: flat_store_byte v[8:9], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[8:9], v22 offset:10 -; CHECK-NEXT: flat_store_byte v[8:9], v23 offset:9 -; CHECK-NEXT: flat_store_byte v[8:9], v24 offset:8 -; CHECK-NEXT: flat_store_byte v[8:9], v20 offset:15 -; CHECK-NEXT: flat_store_byte v[8:9], v19 offset:14 -; CHECK-NEXT: flat_store_byte v[8:9], v18 offset:13 -; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 ; CHECK-NEXT: v_mov_b32_e32 v11, s11 -; CHECK-NEXT: flat_load_ubyte v12, v[10:11] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[10:11] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[10:11] offset:7 -; CHECK-NEXT: flat_load_ubyte v15, v[10:11] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[10:11] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[10:11] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[10:11] -; CHECK-NEXT: flat_load_ubyte v19, v[10:11] offset:4 -; CHECK-NEXT: flat_load_ubyte v20, v[10:11] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[10:11] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[10:11] offset:15 -; CHECK-NEXT: flat_load_ubyte v23, v[10:11] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[10:11] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[10:11] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[10:11] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[10:11] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; CHECK-NEXT: v_mov_b32_e32 v15, s11 ; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s10, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_add_co_u32_e32 v14, vcc, s10, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 -; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 -; CHECK-NEXT: flat_store_byte v[10:11], v17 offset:1 -; CHECK-NEXT: flat_store_byte v[10:11], v18 -; CHECK-NEXT: flat_store_byte v[10:11], v14 offset:7 -; CHECK-NEXT: flat_store_byte v[10:11], v13 offset:6 -; CHECK-NEXT: flat_store_byte v[10:11], v12 offset:5 -; CHECK-NEXT: flat_store_byte v[10:11], v19 offset:4 -; CHECK-NEXT: flat_store_byte v[10:11], v23 offset:11 -; CHECK-NEXT: flat_store_byte v[10:11], v24 offset:10 -; CHECK-NEXT: flat_store_byte v[10:11], v25 offset:9 -; CHECK-NEXT: flat_store_byte v[10:11], v26 offset:8 -; CHECK-NEXT: flat_store_byte v[10:11], v22 offset:15 -; CHECK-NEXT: flat_store_byte v[10:11], v21 offset:14 -; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 -; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_14 ; CHECK-NEXT: .LBB0_15: ; %Flow20 @@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB1_9 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c385..8c28fac0d839c2 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) @@ -1256,108 +295,21 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -1431,375 +383,59 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index 7575782c1b2acd..cadc3dadb0a1e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -13,55 +13,9 @@ define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -458,58 +139,13 @@ define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v17, v2 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v18, v2 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b128 v[3:6], v2 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:3 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:5 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:7 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:9 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:11 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ushort v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:10 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:14 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v23, 8, v24 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v30 -; CHECK-NEXT: v_lshl_or_b32 v4, v14, 16, v12 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v33, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v2, 16, v18 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v20, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v21, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v22, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v23, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v3, v21, 16, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v13, 16, v12 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v23, 16, v2 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v15 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v20, 16, v19 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v22, 16, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v8 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v22, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v29 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v32, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23 -; CHECK-NEXT: ds_write_b64 v0, v[3:4] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v19, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v20, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v21, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v22, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v11, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v18 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v22 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b128 v0, v[1:4] -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v10, v15, 8, v14 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v1, v13, 16, v12 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v9, v1, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) ret void -} - -define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { -; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +} + +define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7411,30 +3732,10 @@ define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false) @@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v10, v1 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false) @@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false) @@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index cc5256620bfe08..4e5688adcd6bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -178,103 +51,13 @@ define void @memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -559,55 +242,9 @@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v2 -; CHECK-NEXT: ds_read_u8 v27, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v26 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v27, v2 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v33, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v27 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1856,31 +747,9 @@ define void @memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:5 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v12, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v16, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v15, v18, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v14, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v18, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v17, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v20, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v19, v28, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v4, v18, 16, v17 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v21, v30, 8, v31 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v22, v32, 8, v33 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v12, v12, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v20, 16, v19 -; CHECK-NEXT: v_lshl_or_b32 v2, v22, 16, v21 -; CHECK-NEXT: global_store_byte v[0:1], v13, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v12, off offset:28 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -2911,23 +1253,8 @@ define void @memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v7, v12, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v2, v18, 16, v19 -; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v20, off offset:30 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v4, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v8, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v13, v19, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v14, v24, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v27 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v18, v4, v32 -; CHECK-NEXT: v_lshl_or_b32 v4, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v33, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v8, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v7 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v19, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v11, 8, v11 -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v2, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v10, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v6, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v16, v25, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v3, v14, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v11, v11, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v18, 16, v17 -; CHECK-NEXT: ds_write_b8 v0, v12 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v13 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v11 offset:28 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v1, v12, 16, v13 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v16, 16, v17 -; CHECK-NEXT: ds_write_b16 v0, v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; CHECK-NEXT: ds_write_b8 v0, v18 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4553,40 +2429,13 @@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v3, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v7, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v4, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v16, v2, 16, v1 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v22, 8, v21 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v12, v23, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v17, v3, v31 -; CHECK-NEXT: v_lshl_or_b32 v3, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v32 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v16 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v17 offset:28 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v18, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: ds_write_b16 v0, v16 offset:28 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v17 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v18 offset:24 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v25, v1 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7564,100 +4363,24 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8121,100 +4638,34 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:9 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll index 94bc6d46b2395b..8ad6a4e534d232 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll @@ -19,8 +19,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs ; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_add_f16_e32 v4, v6, v7 ; CHECK-NEXT: v_add_f16_e32 v2, v3, v2 -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2 +; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2 +; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index 4e734d6e0884bc..fc33a274d7b11a 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 { ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte define double @private_load_align1_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 1 ret double %x @@ -622,7 +629,14 @@ define double @private_load_align1_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 1 ret void @@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 { ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort define double @private_load_align2_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 2 ret double %x @@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 2 ret void